diff --git a/.github/ISSUE_TEMPLATE/---document-issue-.md b/.github/ISSUE_TEMPLATE/---document-issue-.md index 7c464ac584bc87cb16a796bf41acdcd79b8bd6f0..ffc2fcd7817b64584637a646edf5907612a7bbaf 100644 --- a/.github/ISSUE_TEMPLATE/---document-issue-.md +++ b/.github/ISSUE_TEMPLATE/---document-issue-.md @@ -56,4 +56,4 @@ For example: no sample code; The sample code is not helpful; The sample code not For example:Chinese API in this doc is inconsistent with English API, including params, description, sample code, formula, etc. #### Other -For example: The doc link is broken; The doc page is missing; Dead link in docs. \ No newline at end of file +For example: The doc link is broken; The doc page is missing; Dead link in docs. diff --git a/CMakeLists.txt b/CMakeLists.txt index f24513d605c49b608cb32425a861448a3acd6c6a..f30671bd3a87e87732b3a047e91811452370e06e 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -13,6 +13,7 @@ # limitations under the License cmake_minimum_required(VERSION 3.10) +cmake_policy(VERSION 3.10) set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake") set(PADDLE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}) @@ -21,9 +22,6 @@ include(system) project(paddle CXX C) -include(init) -include(generic) # simplify cmake module - # enable language CUDA # TODO(Shibo Tao): remove find_package(CUDA) completely. find_package(CUDA QUIET) @@ -32,16 +30,23 @@ option(WITH_TENSORRT "Compile PaddlePaddle with NVIDIA TensorRT" OFF) option(WITH_XPU "Compile PaddlePaddle with BAIDU KUNLUN XPU" OFF) option(WITH_WIN_DUMP_DBG "Compile with windows core dump debug mode" OFF) option(WITH_ASCEND "Compile PaddlePaddle with ASCEND" OFF) +option(WITH_ROCM "Compile PaddlePaddle with ROCM platform" OFF) +# NOTE(zhiqiu): WITH_ASCEND_CL can be compile on x86_64, so we can set WITH_ASCEND=OFF and WITH_ASCEND_CL=ON +# to develop some acl related functionality on x86 +option(WITH_ASCEND_CL "Compile PaddlePaddle with ASCEND CL" ${WITH_ASCEND}) +option(WITH_ASCEND_CXX11 "Compile PaddlePaddle with ASCEND and CXX11 ABI" OFF) +# Note(zhouwei): It use option above, so put here +include(init) +include(generic) # simplify cmake module + if (WITH_GPU AND WITH_XPU) message(FATAL_ERROR "Error when compile GPU and XPU at the same time") endif() -if (WITH_GPU AND WITH_ASCEND) +if (WITH_GPU AND WITH_ASCEND) message(FATAL_ERROR "Error when compile GPU and ASCEND at the same time") endif() -# cmake 3.12, 3.13, 3.14 will append gcc link options to nvcc, and nvcc doesn't recognize them. -if(WITH_GPU AND (${CMAKE_VERSION} VERSION_GREATER_EQUAL 3.12) AND (${CMAKE_VERSION} VERSION_LESS 3.15)) - message(FATAL_ERROR "cmake ${CMAKE_VERSION} is not supported when WITH_GPU=ON because of bug https://cmake.org/pipermail/cmake/2018-September/068195.html. " - "You can use cmake 3.16 (recommended), 3.10, 3.11, 3.15 or 3.17. 
Please refer to the install document: https://cmake.org/install/") +if (WITH_GPU AND WITH_ROCM) + message(FATAL_ERROR "Error when compile CUDA and ROCM at the same time") endif() if(WITH_GPU AND NOT APPLE) @@ -61,6 +66,10 @@ if(WITH_MUSL) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=deprecated-declarations -Wno-deprecated-declarations -Wno-error=pessimizing-move -Wno-error=deprecated-copy") endif() +if(WITH_ASCEND_CL AND NOT WITH_ASCEND_CXX11) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0") +endif() + if(WIN32) option(MSVC_STATIC_CRT "use static C Runtime library by default" ON) @@ -72,6 +81,13 @@ if(WIN32) set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj") set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj") + if("${CMAKE_GENERATOR}" STREQUAL "Ninja") + set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /Zc:inline") + set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /Zc:inline") + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /Zc:inline") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Zc:inline") + endif() + if (MSVC_STATIC_CRT) message(STATUS "Use static C runtime time, refer to https://docs.microsoft.com/en-us/cpp/c-runtime-library/crt-library-features?view=vs-2019") set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /MTd") @@ -89,8 +105,8 @@ if(WIN32) endforeach(flag_var) endif() - # NOTE(Avin0323): Less parallel count result in faster compilation. math(EXPR PROCESS_MAX "${CPU_CORES} * 2 / 3") + # windows build turn off warnings, use parallel compiling. foreach(flag_var CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE @@ -98,7 +114,10 @@ if(WIN32) CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO) string(REGEX REPLACE "/W[1-4]" " /W0 " ${flag_var} "${${flag_var}}") - set(${flag_var} "${${flag_var}} /MP${PROCESS_MAX}") + # NOTE(zhouwei25): GPU compile have too high memory utilization when parallel compiling + if(NOT WITH_GPU) + set(${flag_var} "${${flag_var}} /MP${PROCESS_MAX}") + endif() endforeach(flag_var) foreach(flag_var CMAKE_CXX_FLAGS CMAKE_C_FLAGS) set(${flag_var} "${${flag_var}} /w") @@ -116,6 +135,13 @@ if(WIN32) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838") + foreach(flag_var CMAKE_SHARED_LINKER_FLAGS CMAKE_STATIC_LINKER_FLAGS CMAKE_EXE_LINKER_FLAGS CMAKE_LINKER_FLAGS) + set(${flag_var} "${${flag_var}} /ignore:4049 /ignore:4217 /ignore:4006 /ignore:4221") + if(MSVC_STATIC_CRT) + set(${flag_var} "${${flag_var}} /NODEFAULTLIB:MSVCRT.LIB") + endif() + endforeach(flag_var) + if (WITH_WIN_DUMP_DBG) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /Zi") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Zi") @@ -153,8 +179,6 @@ option(WITH_DISTRIBUTE "Compile with distributed support" OFF) option(WITH_BRPC_RDMA "Use brpc rdma as the rpc protocal" OFF) option(ON_INFER "Turn on inference optimization and inference-lib generation" OFF) ################################ Internal Configurations ####################################### -option(WITH_ROCM "Compile PaddlePaddle with ROCM platform" OFF) -option(WITH_RCCL "Compile PaddlePaddle with RCCL support" OFF) option(WITH_NV_JETSON "Compile PaddlePaddle with NV JETSON" OFF) option(WITH_PROFILER "Compile PaddlePaddle with GPU profiler and gperftools" OFF) option(WITH_COVERAGE "Compile PaddlePaddle with code coverage" OFF) @@ -165,14 +189,15 @@ 
option(WITH_PSLIB "Compile with pslib support" OFF) option(WITH_BOX_PS "Compile with box_ps support" OFF) option(WITH_XBYAK "Compile with xbyak support" ON) option(WITH_CONTRIB "Compile the third-party contributation" OFF) -option(WITH_GRPC "Use grpc as the default rpc framework" ${WITH_DISTRIBUTE}) option(WITH_PSCORE "Compile with parameter server support" ${WITH_DISTRIBUTE}) +option(WITH_HETERPS "Compile with heterps" OFF}) option(WITH_INFERENCE_API_TEST "Test fluid inference C++ high-level api interface" OFF) option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VERSION}) option(WITH_DGC "Use DGC(Deep Gradient Compression) or not" ${WITH_DISTRIBUTE}) option(SANITIZER_TYPE "Choose the type of sanitizer, options are: Address, Leak, Memory, Thread, Undefined" OFF) option(WITH_LITE "Compile Paddle Fluid with Lite Engine" OFF) option(WITH_NCCL "Compile PaddlePaddle with NCCL support" ON) +option(WITH_RCCL "Compile PaddlePaddle with RCCL support" ON) option(WITH_XPU_BKCL "Compile PaddlePaddle with BAIDU KUNLUN XPU BKCL" OFF) option(WITH_CRYPTO "Compile PaddlePaddle with crypto support" ON) option(WITH_ARM "Compile PaddlePaddle with arm support" OFF) @@ -180,6 +205,7 @@ option(WITH_SW "Compile PaddlePaddle with sw support" OFF) option(WITH_MIPS "Compile PaddlePaddle with mips support" OFF) option(WITH_MUSL "Compile with musl libc instead of gblic" OFF) option(WITH_UNITY_BUILD "Compile with UnityBuild mode" OFF) +option(WITH_STRIP "Strip so files of Whl packages" OFF) # PY_VERSION if(NOT PY_VERSION) @@ -240,9 +266,6 @@ endif() if(WITH_BRPC_RDMA) message(STATUS "Use brpc with rdma.") - if(WITH_GRPC) - message(FATAL_ERROR "Can't use grpc with brpc rdma.") - endif() if(NOT WITH_DISTRIBUTE) message(FATAL_ERROR "Can't use brpc rdma in no distribute env.") endif() @@ -290,9 +313,9 @@ endif(WITH_ROCM) if (NOT WITH_ROCM AND WITH_RCCL) MESSAGE(WARNING - "Disable RCCL when compiling without GPU. Force WITH_RCCL=OFF.") - set(WITH_NCCL OFF CACHE STRING - "Disable RCCL when compiling without GPU" FORCE) + "Disable RCCL when compiling without ROCM. Force WITH_RCCL=OFF.") + set(WITH_RCCL OFF CACHE STRING + "Disable RCCL when compiling without ROCM" FORCE) endif() if(WITH_RCCL) @@ -330,6 +353,11 @@ if (WITH_MIPS) add_definitions(-DPADDLE_WITH_MIPS) endif() +if (WITH_HETERPS) + if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -faligned-new") + endif() +endif() set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build") set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG") @@ -347,6 +375,13 @@ else() message(WARNING "On inference mode, will take place some specific optimization. Turn on the ON_INFER flag when building inference_lib only.") endif() +if(WITH_STRIP) + find_program(STRIP_PATH strip) + if(NOT STRIP_PATH OR NOT LINUX) + set(WITH_STRIP OFF CACHE STRING "Command strip is only used on Linux when it exists." FORCE) + endif() +endif() + add_subdirectory(paddle) if(WITH_PYTHON) add_subdirectory(python) diff --git a/README.md b/README.md index e8a7013d0b4432bc871843b83cf19494ca870cbc..8b437e4115abe80073866f52f3d7e387e2a554d3 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,4 @@ - -

+
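# --- Minimal sketch (not part of the patch): one way the new WITH_STRIP option and the
# STRIP_PATH found by `find_program(STRIP_PATH strip)` in the CMakeLists.txt hunk above
# could be consumed. The patch itself only defines the option and disables it when the
# strip tool is missing or the host is not Linux; the post-build step and the
# `my_extension` shared-library target below are assumptions for illustration only.
if(WITH_STRIP)
  add_custom_command(TARGET my_extension POST_BUILD
    # Remove the symbol table from the freshly built .so to shrink the wheel package.
    COMMAND ${STRIP_PATH} -s $<TARGET_FILE:my_extension>
    COMMENT "Stripping shared library my_extension")
endif()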

diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 9c1bd52e7fb7dfad5f6dc36d850468bf69ee92cd..e7f125269be1f5e015c6cf015489c312538ca4ba 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -82,6 +82,10 @@ if(WITH_ASCEND) add_definitions(-DPADDLE_WITH_ASCEND) endif() +if(WITH_ASCEND_CL) + add_definitions(-DPADDLE_WITH_ASCEND_CL) +endif() + if(WITH_XPU) message(STATUS "Compile with XPU!") add_definitions(-DPADDLE_WITH_XPU) @@ -93,13 +97,18 @@ if(WITH_GPU) FIND_PACKAGE(CUDA REQUIRED) - if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS 7) - message(FATAL_ERROR "Paddle needs CUDA >= 7.0 to compile") + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS 10.1) + message(FATAL_ERROR "Paddle needs CUDA >= 10.1 to compile") endif() if(NOT CUDNN_FOUND) message(FATAL_ERROR "Paddle needs cudnn to compile") endif() + + if(${CUDNN_MAJOR_VERSION} VERSION_LESS 7) + message(FATAL_ERROR "Paddle needs CUDNN >= 7.0 to compile") + endif() + if(CUPTI_FOUND) include_directories(${CUPTI_INCLUDE_DIR}) add_definitions(-DPADDLE_WITH_CUPTI) @@ -164,10 +173,9 @@ if(WITH_PSCORE) add_definitions(-DPADDLE_WITH_PSCORE) endif() - -if(WITH_GRPC) - add_definitions(-DPADDLE_WITH_GRPC) -endif(WITH_GRPC) +if(WITH_HETERPS) + add_definitions(-DPADDLE_WITH_HETERPS) +endif() if(WITH_BRPC_RDMA) add_definitions(-DPADDLE_WITH_BRPC_RDMA) diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index 2f4f5449f482d71a2a27957af4b5f17601ab634f..7f2addb02d36ddf85cd08542cc5baab31d495bc5 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -6,15 +6,9 @@ endif() if (WITH_NV_JETSON) add_definitions(-DWITH_NV_JETSON) set(paddle_known_gpu_archs "53 62 72") - set(paddle_known_gpu_archs7 "53") - set(paddle_known_gpu_archs8 "53 62") - set(paddle_known_gpu_archs9 "53 62") set(paddle_known_gpu_archs10 "53 62 72") else() - set(paddle_known_gpu_archs "30 35 50 52 60 61 70") - set(paddle_known_gpu_archs7 "30 35 50 52") - set(paddle_known_gpu_archs8 "30 35 50 52 60 61") - set(paddle_known_gpu_archs9 "30 35 50 52 60 61 70") + set(paddle_known_gpu_archs "35 50 52 60 61 70 75 80") set(paddle_known_gpu_archs10 "35 50 52 60 61 70 75") set(paddle_known_gpu_archs11 "52 60 61 70 75 80") endif() @@ -74,7 +68,7 @@ endfunction() # select_nvcc_arch_flags(out_variable) function(select_nvcc_arch_flags out_variable) # List of arch names - set(archs_names "Kepler" "Maxwell" "Pascal" "Volta" "Turing" "All" "Manual") + set(archs_names "Kepler" "Maxwell" "Pascal" "Volta" "Turing" "Ampere" "All" "Manual") set(archs_name_default "Auto") list(APPEND archs_names "Auto") @@ -91,7 +85,7 @@ function(select_nvcc_arch_flags out_variable) if(${CUDA_ARCH_NAME} STREQUAL "Manual") set(CUDA_ARCH_BIN ${paddle_known_gpu_archs} CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported") - set(CUDA_ARCH_PTX "50" CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for") + set(CUDA_ARCH_PTX "" CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for") mark_as_advanced(CUDA_ARCH_BIN CUDA_ARCH_PTX) else() unset(CUDA_ARCH_BIN CACHE) @@ -108,6 +102,8 @@ function(select_nvcc_arch_flags out_variable) set(cuda_arch_bin "70") elseif(${CUDA_ARCH_NAME} STREQUAL "Turing") set(cuda_arch_bin "75") + elseif(${CUDA_ARCH_NAME} STREQUAL "Ampere") + set(cuda_arch_bin "80") elseif(${CUDA_ARCH_NAME} STREQUAL "All") set(cuda_arch_bin ${paddle_known_gpu_archs}) elseif(${CUDA_ARCH_NAME} STREQUAL "Auto") @@ -158,31 +154,21 @@ function(select_nvcc_arch_flags out_variable) endfunction() message(STATUS "CUDA 
detected: " ${CMAKE_CUDA_COMPILER_VERSION}) -if (${CMAKE_CUDA_COMPILER_VERSION} LESS 7.0) - set(paddle_known_gpu_archs ${paddle_known_gpu_archs}) -elseif (${CMAKE_CUDA_COMPILER_VERSION} LESS 8.0) # CUDA 7.x - set(paddle_known_gpu_archs ${paddle_known_gpu_archs7}) - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED") - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__") -elseif (${CMAKE_CUDA_COMPILER_VERSION} LESS 9.0) # CUDA 8.x - set(paddle_known_gpu_archs ${paddle_known_gpu_archs8}) +if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) # CUDA 10.x + set(paddle_known_gpu_archs ${paddle_known_gpu_archs10}) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__") - # CUDA 8 may complain that sm_20 is no longer supported. Suppress the - # warning for now. set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets") -elseif (${CMAKE_CUDA_COMPILER_VERSION} LESS 10.0) # CUDA 9.x - set(paddle_known_gpu_archs ${paddle_known_gpu_archs9}) - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED") - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__") -elseif (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) # CUDA 10.x - set(paddle_known_gpu_archs ${paddle_known_gpu_archs10}) +elseif (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.2) # CUDA 11.0/11.1 + set(paddle_known_gpu_archs ${paddle_known_gpu_archs11}) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__") -elseif (${CMAKE_CUDA_COMPILER_VERSION} LESS 12.0) # CUDA 11.x - set(paddle_known_gpu_archs ${paddle_known_gpu_archs11}) + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets") +elseif (${CMAKE_CUDA_COMPILER_VERSION} LESS 12.0) # CUDA 11.2+ + set(paddle_known_gpu_archs "${paddle_known_gpu_archs11} 86") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__") + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets") endif() if (NOT ${CMAKE_CUDA_COMPILER_VERSION} LESS 10.0) @@ -198,14 +184,11 @@ select_nvcc_arch_flags(NVCC_FLAGS_EXTRA) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${NVCC_FLAGS_EXTRA}") message(STATUS "NVCC_FLAGS_EXTRA: ${NVCC_FLAGS_EXTRA}") -# Set C++11 support +# Set C++14 support set(CUDA_PROPAGATE_HOST_FLAGS OFF) # Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc. # So, don't set these flags here. -if (NOT WIN32) # windows msvc2015 support c++11 natively. - # -std=c++11 -fPIC not recoginize by msvc, -Xcompiler will be added by cmake. - set(CMAKE_CUDA_STANDARD 11) -endif(NOT WIN32) +set(CMAKE_CUDA_STANDARD 14) # (Note) For windows, if delete /W[1-4], /W1 will be added defaultly and conflic with -w # So replace /W[1-4] with /W0 diff --git a/cmake/cudnn.cmake b/cmake/cudnn.cmake index d8d8f634e76b6bf05d4936921ce37c889a4bdc7c..c82847100abefa6fcbaf1367965699413aadcadb 100644 --- a/cmake/cudnn.cmake +++ b/cmake/cudnn.cmake @@ -94,7 +94,7 @@ macro(find_cudnn_version cudnn_header_file) "${CUDNN_MAJOR_VERSION} * 1000 + ${CUDNN_MINOR_VERSION} * 100 + ${CUDNN_PATCHLEVEL_VERSION}") message(STATUS "Current cuDNN header is ${cudnn_header_file} " - "Current cuDNN version is v${CUDNN_MAJOR_VERSION}.${CUDNN_MINOR_VERSION}. ") + "Current cuDNN version is v${CUDNN_MAJOR_VERSION}.${CUDNN_MINOR_VERSION}.${CUDNN_PATCHLEVEL_VERSION}. 
") endif() endif() endmacro() diff --git a/cmake/external/ascend.cmake b/cmake/external/ascend.cmake index bcf0c0a0646fc386f41c4b1f35ba773d6a1adb6f..414b2a54be0342b3ef76d5e3a553577cb5f3e4be 100644 --- a/cmake/external/ascend.cmake +++ b/cmake/external/ascend.cmake @@ -12,50 +12,78 @@ # See the License for the specific language governing permissions and # limitations under the License. -INCLUDE(ExternalProject) - -SET(ASCEND_PROJECT "extern_ascend") -IF((NOT DEFINED ASCEND_VER) OR (NOT DEFINED ASCEND_URL)) - MESSAGE(STATUS "use pre defined download url") - SET(ASCEND_VER "0.1.1" CACHE STRING "" FORCE) - SET(ASCEND_NAME "ascend" CACHE STRING "" FORCE) - SET(ASCEND_URL "http://paddle-ascend.bj.bcebos.com/ascend.tar.gz" CACHE STRING "" FORCE) -ENDIF() -MESSAGE(STATUS "ASCEND_NAME: ${ASCEND_NAME}, ASCEND_URL: ${ASCEND_URL}") -SET(ASCEND_SOURCE_DIR "${THIRD_PARTY_PATH}/ascend") -SET(ASCEND_DOWNLOAD_DIR "${ASCEND_SOURCE_DIR}/src/${ASCEND_PROJECT}") -SET(ASCEND_DST_DIR "ascend") -SET(ASCEND_INSTALL_ROOT "${THIRD_PARTY_PATH}/install") -SET(ASCEND_INSTALL_DIR ${ASCEND_INSTALL_ROOT}/${ASCEND_DST_DIR}) -SET(ASCEND_ROOT ${ASCEND_INSTALL_DIR}) -SET(ASCEND_INC_DIR ${ASCEND_ROOT}/include) -SET(ASCEND_LIB_DIR ${ASCEND_ROOT}/lib) -SET(ASCEND_LIB ${ASCEND_LIB_DIR}/libge_runner.so) -SET(ASCEND_GRAPH_LIB ${ASCEND_LIB_DIR}/libgraph.so) -SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${ASCEND_ROOT}/lib") - -INCLUDE_DIRECTORIES(${ASCEND_INC_DIR}) -FILE(WRITE ${ASCEND_DOWNLOAD_DIR}/CMakeLists.txt - "PROJECT(ASCEND)\n" - "cmake_minimum_required(VERSION 3.0)\n" - "install(DIRECTORY ${ASCEND_NAME}/include ${ASCEND_NAME}/lib \n" - " DESTINATION ${ASCEND_DST_DIR})\n") -ExternalProject_Add( - ${ASCEND_PROJECT} - ${EXTERNAL_PROJECT_LOG_ARGS} - PREFIX ${ASCEND_SOURCE_DIR} - DOWNLOAD_DIR ${ASCEND_DOWNLOAD_DIR} - DOWNLOAD_COMMAND wget --no-check-certificate ${ASCEND_URL} -c -q -O ${ASCEND_NAME}.tar.gz - && tar zxvf ${ASCEND_NAME}.tar.gz - DOWNLOAD_NO_PROGRESS 1 - UPDATE_COMMAND "" - CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${ASCEND_INSTALL_ROOT} - CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${ASCEND_INSTALL_ROOT} -) -ADD_LIBRARY(ascend SHARED IMPORTED GLOBAL) -SET_PROPERTY(TARGET ascend PROPERTY IMPORTED_LOCATION ${ASCEND_LIB}) - -ADD_LIBRARY(ascend_graph SHARED IMPORTED GLOBAL) -SET_PROPERTY(TARGET ascend_graph PROPERTY IMPORTED_LOCATION ${ASCEND_GRAPH_LIB}) -ADD_DEPENDENCIES(ascend ascend_graph ${ASCEND_PROJECT}) +#NOTE: Logic is from +# https://github.com/mindspore-ai/graphengine/blob/master/CMakeLists.txt +if(DEFINED ENV{ASCEND_CUSTOM_PATH}) + set(ASCEND_DIR $ENV{ASCEND_CUSTOM_PATH}) +else() + set(ASCEND_DIR /usr/local/Ascend) +endif() + +if(EXISTS ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include/graph/ascend_string.h) + # It means CANN 20.2 + + add_definitions(-DPADDLE_WITH_ASCEND_STRING) +endif() + + +if(WITH_ASCEND OR WITH_ASCEND_CL) + set(ASCEND_DRIVER_DIR ${ASCEND_DIR}/driver/lib64) + set(ASCEND_DRIVER_COMMON_DIR ${ASCEND_DIR}/driver/lib64/common) + set(ASCEND_DRIVER_SHARE_DIR ${ASCEND_DIR}/driver/lib64/share) + set(ASCEND_RUNTIME_DIR ${ASCEND_DIR}/fwkacllib/lib64) + set(ASCEND_ATC_DIR ${ASCEND_DIR}/atc/lib64) + set(ASCEND_ACL_DIR ${ASCEND_DIR}/acllib/lib64) + set(STATIC_ACL_LIB ${ASCEND_ACL_DIR}) + + set(ASCEND_MS_RUNTIME_PATH ${ASCEND_RUNTIME_DIR} ${ASCEND_ACL_DIR} ${ASCEND_ATC_DIR}) + set(ASCEND_MS_DRIVER_PATH ${ASCEND_DRIVER_DIR} ${ASCEND_DRIVER_COMMON_DIR}) + set(ATLAS_RUNTIME_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64) + set(ATLAS_RUNTIME_INC_DIR 
${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include) + set(ATLAS_ACL_DIR ${ASCEND_DIR}/ascend-toolkit/latest/acllib/lib64) + set(ATLAS_ATC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/atc/lib64) + set(ATLAS_MS_RUNTIME_PATH ${ATLAS_RUNTIME_DIR} ${ATLAS_ACL_DIR} ${ATLAS_ATC_DIR}) + + set(atlas_graph_lib ${ATLAS_RUNTIME_DIR}/libgraph.so) + set(atlas_ge_runner_lib ${ATLAS_RUNTIME_DIR}/libge_runner.so) + set(atlas_acl_lib ${ATLAS_RUNTIME_DIR}/libascendcl.so) + INCLUDE_DIRECTORIES(${ATLAS_RUNTIME_INC_DIR}) + + + ADD_LIBRARY(ascend_ge SHARED IMPORTED GLOBAL) + SET_PROPERTY(TARGET ascend_ge PROPERTY IMPORTED_LOCATION ${atlas_ge_runner_lib}) + + ADD_LIBRARY(ascend_graph SHARED IMPORTED GLOBAL) + SET_PROPERTY(TARGET ascend_graph PROPERTY IMPORTED_LOCATION ${atlas_graph_lib}) + + ADD_LIBRARY(atlas_acl SHARED IMPORTED GLOBAL) + SET_PROPERTY(TARGET atlas_acl PROPERTY IMPORTED_LOCATION ${atlas_acl_lib}) + + add_custom_target(extern_ascend DEPENDS ascend_ge ascend_graph atlas_acl) +endif() + +if(WITH_ASCEND_CL) + set(ASCEND_CL_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64) + + set(ascend_hccl_lib ${ASCEND_CL_DIR}/libhccl.so) + set(ascendcl_lib ${ASCEND_CL_DIR}/libascendcl.so) + set(acl_op_compiler_lib ${ASCEND_CL_DIR}/libacl_op_compiler.so) + set(FWKACLLIB_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include) + set(ACLLIB_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/acllib/include) + + message(STATUS "FWKACLLIB_INC_DIR ${FWKACLLIB_INC_DIR}") + message(STATUS "ASCEND_CL_DIR ${ASCEND_CL_DIR}") + INCLUDE_DIRECTORIES(${FWKACLLIB_INC_DIR}) + INCLUDE_DIRECTORIES(${ACLLIB_INC_DIR}) + + ADD_LIBRARY(ascendcl SHARED IMPORTED GLOBAL) + SET_PROPERTY(TARGET ascendcl PROPERTY IMPORTED_LOCATION ${ascendcl_lib}) + + ADD_LIBRARY(ascend_hccl SHARED IMPORTED GLOBAL) + SET_PROPERTY(TARGET ascend_hccl PROPERTY IMPORTED_LOCATION ${ascend_hccl_lib}) + + ADD_LIBRARY(acl_op_compiler SHARED IMPORTED GLOBAL) + SET_PROPERTY(TARGET acl_op_compiler PROPERTY IMPORTED_LOCATION ${acl_op_compiler_lib}) + add_custom_target(extern_ascend_cl DEPENDS ascendcl acl_op_compiler) + +endif() diff --git a/cmake/external/brpc.cmake b/cmake/external/brpc.cmake index 0eb590c42d0cb73ccb252430bc3e27312b0bddf9..2d72b6eb56deaa2547051756afc075a100aeb251 100644 --- a/cmake/external/brpc.cmake +++ b/cmake/external/brpc.cmake @@ -39,9 +39,9 @@ set(prefix_path "${THIRD_PARTY_PATH}/install/gflags|${THIRD_PARTY_PATH}/install/ ExternalProject_Add( extern_brpc ${EXTERNAL_PROJECT_LOG_ARGS} - # TODO(gongwb): change to de newst repo when they changed. 
+ # TODO(gongwb): change to de newst repo when they changed GIT_REPOSITORY "https://github.com/wangjiawei04/brpc" - GIT_TAG "6d79e0b17f25107c35b705ea58d888083f59ff47" + GIT_TAG "e203afb794caf027da0f1e0776443e7d20c0c28e" PREFIX ${BRPC_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake index 5a755a816c332a2517ed61caa94d647afd557aae..aa471002eacb6a61a9cf835f293a86a75d87db8f 100644 --- a/cmake/external/eigen.cmake +++ b/cmake/external/eigen.cmake @@ -14,11 +14,11 @@ include(ExternalProject) -# update eigen to the commit id 4da2c6b1 on 03/19/2020 +# update eigen to the commit id f612df27 on 03/16/2021 set(EIGEN_PREFIX_DIR ${THIRD_PARTY_PATH}/eigen3) set(EIGEN_SOURCE_DIR ${THIRD_PARTY_PATH}/eigen3/src/extern_eigen3) set(EIGEN_REPOSITORY https://gitlab.com/libeigen/eigen.git) -set(EIGEN_TAG 4da2c6b1974827b1999bab652a3d4703e1992d26) +set(EIGEN_TAG f612df273689a19d25b45ca4f8269463207c4fee) cache_third_party(extern_eigen3 REPOSITORY ${EIGEN_REPOSITORY} @@ -27,47 +27,15 @@ cache_third_party(extern_eigen3 if(WIN32) add_definitions(-DEIGEN_STRONG_INLINE=inline) - file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/Half.h native_src) - file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/Eigen/src/Core/arch/CUDA/Half.h native_dst) - # For Windows - # which will cause a compilation error in Tensor:74: - # "can not open file 'unistd.h'" - # so use following patch to solve compilation error On Windows. - file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/Tensor native_src2) - file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/unsupported/Eigen/CXX11/Tensor native_dst2) - # For VS2015 - # which will cause a compilation error in TensorBlock.h:1028: - # "syntax error" - # so use following patch to solve compilation error On Windows. - file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/TensorBlock.h native_src3) - file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h native_dst3) - set(EIGEN_PATCH_COMMAND copy ${native_src} ${native_dst} /Y && copy ${native_src2} ${native_dst2} /Y && copy ${native_src3} ${native_dst3} /Y) elseif(LINUX) - # For gxx=4.8, __GXX_ABI_VERSION is less than 1004 - # which will cause a compilation error in Geometry_SSE.h:38: - # "no matching function for call to 'pmul(Eigen::internal::Packet4f&, __m128)" - # refer to: https://gitlab.com/libeigen/eigen/-/blob/4da2c6b1974827b1999bab652a3d4703e1992d26/Eigen/src/Core/arch/SSE/PacketMath.h#L33-60 - # add -fabi-version=4 could avoid above error, but will cause "double free corruption" when compile with gcc8 - # so use following patch to solve compilation error with different version of gcc. 
- file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/Geometry_SSE.h native_src1) - file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/Eigen/src/Geometry/arch/Geometry_SSE.h native_dst1) - # The compiler fully support const expressions since c++14, - # but Eigen use some const expressions such as std::max and std::min, which are not supported in c++11 - # add patch to avoid compilation error in c++11 - file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/MathFunctions.h native_src2) - file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/Eigen/src/Core/MathFunctions.h native_dst2) if(WITH_ROCM) # For HIPCC Eigen::internal::device::numeric_limits is not EIGEN_DEVICE_FUNC # which will cause compiler error of using __host__ funciont in __host__ __device__ - file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/Meta.h native_src3) - file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/Eigen/src/Core/util/Meta.h native_dst3) - # For HIPCC Eigen::internal::scalar_sum_op is not EIGEN_DEVICE_FUNC - # which will cause compiler error of using __host__ funciont in __host__ __device__ - file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/BinaryFunctors.h native_src4) - file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/Eigen/src/Core/functors/BinaryFunctors.h native_dst4) - set(EIGEN_PATCH_COMMAND cp ${native_src1} ${native_dst1} && cp ${native_src2} ${native_dst2} && cp ${native_src3} ${native_dst3} && cp ${native_src4} ${native_dst4}) - else() - set(EIGEN_PATCH_COMMAND cp ${native_src1} ${native_dst1} && cp ${native_src2} ${native_dst2}) + file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/Meta.h native_src) + file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/Eigen/src/Core/util/Meta.h native_dst) + file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/TensorReductionGpu.h native_src1) + file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h native_dst1) + set(EIGEN_PATCH_COMMAND cp ${native_src} ${native_dst} && cp ${native_src1} ${native_dst1}) endif() endif() @@ -82,7 +50,7 @@ ExternalProject_Add( PREFIX ${EIGEN_PREFIX_DIR} SOURCE_DIR ${EIGEN_SOURCE_DIR} UPDATE_COMMAND "" - PATCH_COMMAND ${EIGEN_PATCH_COMMAND} + PATCH_COMMAND ${EIGEN_PATCH_COMMAND} CONFIGURE_COMMAND "" BUILD_COMMAND "" INSTALL_COMMAND "" diff --git a/cmake/external/gloo.cmake b/cmake/external/gloo.cmake index ea7af315e1a690578bd16c89cc83a158dacca4cf..e8db13a694f5578e314dc1a7c95ed24ad88bad02 100644 --- a/cmake/external/gloo.cmake +++ b/cmake/external/gloo.cmake @@ -32,21 +32,39 @@ cache_third_party(extern_gloo TAG ${GLOO_TAG} DIR GLOO_SOURCE_DIR) -ExternalProject_Add( - extern_gloo - ${EXTERNAL_PROJECT_LOG_ARGS} - ${SHALLOW_CLONE} - "${GLOO_DOWNLOAD_CMD}" - PREFIX "${GLOO_PREFIX_DIR}" - SOURCE_DIR "${GLOO_SOURCE_DIR}" - UPDATE_COMMAND "" - CONFIGURE_COMMAND "" - BUILD_COMMAND mkdir -p ${GLOO_SOURCE_DIR}/build - && cd ${GLOO_SOURCE_DIR}/build && cmake .. && make - && mkdir -p ${GLOO_LIBRARY_DIR} ${GLOO_INCLUDE_DIR}/gloo - INSTALL_COMMAND ${CMAKE_COMMAND} -E copy ${GLOO_SOURCE_DIR}/build/gloo/libgloo.a ${GLOO_LIBRARY_DIR} - COMMAND ${CMAKE_COMMAND} -E copy_directory "${GLOO_SOURCE_DIR}/gloo/" "${GLOO_INCLUDE_DIR}/gloo" -) + if(WITH_ASCEND OR WITH_ASCEND_CL) + ExternalProject_Add( + extern_gloo + ${EXTERNAL_PROJECT_LOG_ARGS} + ${SHALLOW_CLONE} + "${GLOO_DOWNLOAD_CMD}" + PREFIX "${GLOO_PREFIX_DIR}" + SOURCE_DIR "${GLOO_SOURCE_DIR}" + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_COMMAND mkdir -p ${GLOO_SOURCE_DIR}/build + && cd ${GLOO_SOURCE_DIR}/build && cmake .. 
-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} && make + && mkdir -p ${GLOO_LIBRARY_DIR} ${GLOO_INCLUDE_DIR}/gloo + INSTALL_COMMAND ${CMAKE_COMMAND} -E copy ${GLOO_SOURCE_DIR}/build/gloo/libgloo.a ${GLOO_LIBRARY_DIR} + COMMAND ${CMAKE_COMMAND} -E copy_directory "${GLOO_SOURCE_DIR}/gloo/" "${GLOO_INCLUDE_DIR}/gloo" + ) +else() + ExternalProject_Add( + extern_gloo + ${EXTERNAL_PROJECT_LOG_ARGS} + ${SHALLOW_CLONE} + "${GLOO_DOWNLOAD_CMD}" + PREFIX "${GLOO_PREFIX_DIR}" + SOURCE_DIR "${GLOO_SOURCE_DIR}" + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_COMMAND mkdir -p ${GLOO_SOURCE_DIR}/build + && cd ${GLOO_SOURCE_DIR}/build && cmake .. && make + && mkdir -p ${GLOO_LIBRARY_DIR} ${GLOO_INCLUDE_DIR}/gloo + INSTALL_COMMAND ${CMAKE_COMMAND} -E copy ${GLOO_SOURCE_DIR}/build/gloo/libgloo.a ${GLOO_LIBRARY_DIR} + COMMAND ${CMAKE_COMMAND} -E copy_directory "${GLOO_SOURCE_DIR}/gloo/" "${GLOO_INCLUDE_DIR}/gloo" + ) +endif() ADD_LIBRARY(gloo STATIC IMPORTED GLOBAL) diff --git a/cmake/external/grpc.cmake b/cmake/external/grpc.cmake deleted file mode 100644 index 536e95c1dc2a4fe6545bd5d3147631aa26cdda98..0000000000000000000000000000000000000000 --- a/cmake/external/grpc.cmake +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -include (ExternalProject) - -SET(GRPC_SOURCES_DIR ${THIRD_PARTY_PATH}/grpc) -SET(GRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/grpc) -SET(GRPC_INCLUDE_DIR "${GRPC_INSTALL_DIR}/include/" CACHE PATH "grpc include directory." FORCE) -SET(GRPC_CPP_PLUGIN "${GRPC_INSTALL_DIR}/bin/grpc_cpp_plugin" CACHE FILEPATH "GRPC_CPP_PLUGIN" FORCE) - -include(ProcessorCount) -ProcessorCount(NUM_OF_PROCESSOR) - -IF(APPLE) - SET(BUILD_CMD make -n HAS_SYSTEM_PROTOBUF=false -s -j ${NUM_OF_PROCESSOR} static grpc_cpp_plugin | sed "s/-Werror//g" | sh) - SET(GRPC_INSTALL_CMD make prefix=${GRPC_INSTALL_DIR} install) -ELSE() - SET(GRPC_CFLAGS "-Wno-error -std=c11 ${CLFAGS}") - SET(GRPC_CXXFLAGS "-Wno-error -std=c++11 ${CXXFLAGS}") - SET(BUILD_CMD make CFLAGS=${GRPC_CFLAGS} CXXFLAGS=${GRPC_CXXFLAGS} HAS_SYSTEM_PROTOBUF=false -s -j ${NUM_OF_PROCESSOR} static grpc_cpp_plugin) - SET(GRPC_INSTALL_CMD make prefix=${GRPC_INSTALL_DIR} install CFLAGS=${GRPC_CFLAGS} CXXFLAGS=${GRPC_CXXFLAGS}) -ENDIF() - -# FIXME(wuyi): do not build zlib cares protobuf twice, find a way to build grpc with them -ExternalProject_Add( - extern_grpc - DEPENDS protobuf zlib - # NOTE(wuyi): - # this package is generated by following steps: - # 1. git clone -b v1.8.x https://github.com/grpc/grpc.git - # 2. git submodule update --init - # 3. keep only zlib, cares, protobuf, boringssl under "third_party", - # checkout and clean other dirs under third_party - # 4. remove .git, and package the directory. 
- URL http://paddlepaddledeps.bj.bcebos.com/grpc-v1.10.x_paddle.tar.gz - URL_MD5 f5442d137ddccee252e194b1bc90f98c - PREFIX ${GRPC_SOURCES_DIR} - UPDATE_COMMAND "" - CONFIGURE_COMMAND "" - BUILD_IN_SOURCE 1 - # NOTE(yuyang18): - # Disable -Werror, otherwise the compile will fail in MacOS. - # It seems that we cannot configure that by make command. - # Just dry run make command and remove `-Werror`, then use a shell to run make commands - BUILD_COMMAND ${BUILD_CMD} - INSTALL_COMMAND ${GRPC_INSTALL_CMD} -) - -ADD_LIBRARY(grpc++_unsecure STATIC IMPORTED GLOBAL) -SET_PROPERTY(TARGET grpc++_unsecure PROPERTY IMPORTED_LOCATION - "${GRPC_INSTALL_DIR}/lib/libgrpc++_unsecure.a") - -ADD_LIBRARY(grpc++ STATIC IMPORTED GLOBAL) -SET_PROPERTY(TARGET grpc++ PROPERTY IMPORTED_LOCATION - "${GRPC_INSTALL_DIR}/lib/libgrpc++.a") -ADD_LIBRARY(gpr STATIC IMPORTED GLOBAL) -SET_PROPERTY(TARGET gpr PROPERTY IMPORTED_LOCATION - "${GRPC_INSTALL_DIR}/lib/libgpr.a") - -ADD_LIBRARY(grpc_unsecure STATIC IMPORTED GLOBAL) -SET_PROPERTY(TARGET grpc_unsecure PROPERTY IMPORTED_LOCATION - "${GRPC_INSTALL_DIR}/lib/libgrpc_unsecure.a") - -include_directories(${GRPC_INCLUDE_DIR}) -ADD_DEPENDENCIES(grpc++_unsecure extern_grpc) diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake index 884219d8dd81f30e17f7a86380947262014e402a..fb1d4d9d56dcc6f38a86242b4d78b88ef31ddaa0 100644 --- a/cmake/external/mkldnn.cmake +++ b/cmake/external/mkldnn.cmake @@ -20,7 +20,7 @@ SET(MKLDNN_SOURCE_DIR ${THIRD_PARTY_PATH}/mkldnn/src/extern_mkldnn) SET(MKLDNN_INSTALL_DIR ${THIRD_PARTY_PATH}/install/mkldnn) SET(MKLDNN_INC_DIR "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." FORCE) SET(MKLDNN_REPOSITORY ${GIT_URL}/oneapi-src/oneDNN.git) -SET(MKLDNN_TAG 72efa005effb49595933e033cc732f215ef0445a) +SET(MKLDNN_TAG f58682cd8bd0615f41d879f8afc8f1511ab42d24) # Introduce variables: # * CMAKE_INSTALL_LIBDIR diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index 40a27f506f3077a5a47289d20906f7c180681b65..c108c05368c915f6d4998d46713cda315dfb93ff 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -198,8 +198,16 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) "-Dprotobuf_MSVC_STATIC_RUNTIME=${MSVC_STATIC_CRT}") ENDIF() +if(WITH_ASCEND AND NOT WITH_ASCEND_CXX11) + SET(PROTOBUF_REPOSITORY https://gitee.com/tianjianhe/protobuf.git) + SET(PROTOBUF_TAG v3.8.0) +elseif(WITH_ASCEND_CL AND NOT WITH_ASCEND_CXX11) + SET(PROTOBUF_REPOSITORY https://gitee.com/tianjianhe/protobuf.git) + SET(PROTOBUF_TAG v3.8.0) +else() SET(PROTOBUF_REPOSITORY ${GIT_URL}/protocolbuffers/protobuf.git) SET(PROTOBUF_TAG 9f75c5aa851cd877fb0d93ccc31b8567a6706546) +endif() cache_third_party(${TARGET_NAME} REPOSITORY ${PROTOBUF_REPOSITORY} @@ -234,7 +242,11 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) ) ENDFUNCTION() -SET(PROTOBUF_VERSION 3.1.0) +if(WITH_ASCEND OR WITH_ASCEND_CL) + SET(PROTOBUF_VERSION 3.8.0) +else() + SET(PROTOBUF_VERSION 3.1.0) +endif() IF(NOT PROTOBUF_FOUND) build_protobuf(extern_protobuf FALSE) diff --git a/cmake/external/threadpool.cmake b/cmake/external/threadpool.cmake index 205e8d26d93ca1c25e5b59ecc3b063b4837db77b..f9cb3a9075a821025129c1f6acb479a4ad6ac95c 100644 --- a/cmake/external/threadpool.cmake +++ b/cmake/external/threadpool.cmake @@ -16,7 +16,11 @@ INCLUDE(ExternalProject) SET(THREADPOOL_PREFIX_DIR ${THIRD_PARTY_PATH}/threadpool) SET(THREADPOOL_SOURCE_DIR ${THIRD_PARTY_PATH}/threadpool/src/extern_threadpool) -SET(THREADPOOL_REPOSITORY 
${GIT_URL}/progschj/ThreadPool.git) +if(WITH_ASCEND OR WITH_ASCEND_CL) + SET(THREADPOOL_REPOSITORY https://gitee.com/tianjianhe/ThreadPool.git) +else() + SET(THREADPOOL_REPOSITORY ${GIT_URL}/progschj/ThreadPool.git) +endif() SET(THREADPOOL_TAG 9a42ec1329f259a5f4881a291db1dcb8f2ad9040) cache_third_party(extern_threadpool diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake index 0ee3e2116a94b68d528a475a453d1c31f0464cf4..c591a9391dfa5d3b5a452ffbb5a5d3199d387519 100644 --- a/cmake/external/warpctc.cmake +++ b/cmake/external/warpctc.cmake @@ -14,11 +14,17 @@ INCLUDE(ExternalProject) +IF(WITH_ROCM) + add_definitions(-DWARPCTC_WITH_HIP) +ENDIF() + SET(WARPCTC_PREFIX_DIR ${THIRD_PARTY_PATH}/warpctc) SET(WARPCTC_SOURCE_DIR ${THIRD_PARTY_PATH}/warpctc/src/extern_warpctc) SET(WARPCTC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warpctc) +# in case of low internet speed +#set(WARPCTC_REPOSITORY https://gitee.com/tianjianhe/warp-ctc.git) set(WARPCTC_REPOSITORY ${GIT_URL}/baidu-research/warp-ctc.git) -set(WARPCTC_TAG 95a461eddeabd51099ef059dcfada1117eb1bfb8) +set(WARPCTC_TAG c690fc5755abbdbdc98ef78d51ec10a6748a8cd1) SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include" CACHE PATH "Warp-ctc Directory" FORCE) @@ -37,38 +43,92 @@ cache_third_party(extern_warpctc TAG ${WARPCTC_TAG} DIR WARPCTC_SOURCE_DIR) -ExternalProject_Add( - extern_warpctc - ${EXTERNAL_PROJECT_LOG_ARGS} - ${SHALLOW_CLONE} - "${WARPCTC_DOWNLOAD_CMD}" - PREFIX ${WARPCTC_PREFIX_DIR} - SOURCE_DIR ${WARPCTC_SOURCE_DIR} - #UPDATE_COMMAND "" - PATCH_COMMAND "" - BUILD_ALWAYS 1 - CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} - -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} - -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} - -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} - -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} - -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} - -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} - -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR} - -DWITH_GPU=${WITH_GPU} - -DWITH_OMP=${USE_OMP} - -DWITH_TORCH=OFF - -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON - -DBUILD_SHARED=ON - -DBUILD_TESTS=OFF - -DCMAKE_POSITION_INDEPENDENT_CODE=ON - -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} - ${EXTERNAL_OPTIONAL_ARGS} - CMAKE_CACHE_ARGS -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} - -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON - -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR} -) +if(WITH_ASCEND OR WITH_ASCEND_CL) + ExternalProject_Add( + extern_warpctc + ${EXTERNAL_PROJECT_LOG_ARGS} + ${SHALLOW_CLONE} + "${WARPCTC_DOWNLOAD_CMD}" + PREFIX ${WARPCTC_PREFIX_DIR} + SOURCE_DIR ${WARPCTC_SOURCE_DIR} + #UPDATE_COMMAND "" + PATCH_COMMAND "" + BUILD_ALWAYS 1 + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} + "-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}" + -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} + -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR} + -DWITH_GPU=${WITH_GPU} + -DWITH_ROCM=${WITH_ROCM} + -DWITH_OMP=${USE_OMP} + -DWITH_TORCH=OFF + -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON + -DBUILD_SHARED=ON + -DBUILD_TESTS=OFF + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + ${EXTERNAL_OPTIONAL_ARGS} + CMAKE_CACHE_ARGS -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + 
-DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR} + ) +else() + if(WIN32) + set(WARPCTC_C_FLAGS $) + set(WARPCTC_C_FLAGS_DEBUG $) + set(WARPCTC_C_FLAGS_RELEASE $) + set(WARPCTC_CXX_FLAGS $) + set(WARPCTC_CXX_FLAGS_RELEASE $) + set(WARPCTC_CXX_FLAGS_DEBUG $) + else() + set(WARPCTC_C_FLAGS ${CMAKE_C_FLAGS}) + set(WARPCTC_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG}) + set(WARPCTC_C_FLAGS_RELEASE ${CMAKE_C_FLAGS_RELEASE}) + set(WARPCTC_CXX_FLAGS ${CMAKE_CXX_FLAGS}) + set(WARPCTC_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE}) + set(WARPCTC_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG}) + endif() + ExternalProject_Add( + extern_warpctc + ${EXTERNAL_PROJECT_LOG_ARGS} + ${SHALLOW_CLONE} + "${WARPCTC_DOWNLOAD_CMD}" + PREFIX ${WARPCTC_PREFIX_DIR} + SOURCE_DIR ${WARPCTC_SOURCE_DIR} + #UPDATE_COMMAND "" + PATCH_COMMAND "" + BUILD_ALWAYS 1 + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_C_FLAGS=${WARPCTC_C_FLAGS} + -DCMAKE_C_FLAGS_DEBUG=${WARPCTC_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${WARPCTC_C_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS=${WARPCTC_CXX_FLAGS} + -DCMAKE_CXX_FLAGS_RELEASE=${WARPCTC_CXX_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS_DEBUG=${WARPCTC_CXX_FLAGS_DEBUG} + -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR} + -DWITH_GPU=${WITH_GPU} + -DWITH_ROCM=${WITH_ROCM} + -DWITH_OMP=${USE_OMP} + -DWITH_TORCH=OFF + -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON + -DBUILD_SHARED=ON + -DBUILD_TESTS=OFF + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + ${EXTERNAL_OPTIONAL_ARGS} + CMAKE_CACHE_ARGS -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR} + ) +endif() + + IF(WIN32) SET(WARPCTC_LIBRARIES "${WARPCTC_INSTALL_DIR}/bin/warpctc${CMAKE_SHARED_LIBRARY_SUFFIX}" CACHE FILEPATH "Warp-ctc Library" FORCE) diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index b5a3f0154745b9425c3dfc45a129117238fa80de..f846623602ed79a5bd84268436a59ede1957364b 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -13,7 +13,7 @@ if(NOT XPU_SDK_ROOT) elseif(WITH_SUNWAY) SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/sunway/xpu_2021_01_13.tar.gz" CACHE STRING "" FORCE) else() - SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/xpu_2021_02_27.tar.gz" CACHE STRING "" FORCE) + SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/xpu_2021_04_09.tar.gz" CACHE STRING "" FORCE) endif() SET(XPU_SOURCE_DIR "${THIRD_PARTY_PATH}/xpu") diff --git a/cmake/flags.cmake b/cmake/flags.cmake index e110524dd1abb864649daf8bd763e69ae87c600d..a2ddad557c2956f7de21bceaf7a6699e8dfbed43 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -4,10 +4,10 @@ include(CheckCCompilerFlag) include(CheckCXXSymbolExists) include(CheckTypeSize) -function(CheckCompilerCXX11Flag) +function(CheckCompilerCXX14Flag) if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") - if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 4.8) - message(FATAL_ERROR "Unsupported GCC version. GCC >= 4.8 required.") + if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 5.4) + message(FATAL_ERROR "Unsupported GCC version. GCC >= 5.4 required.") elseif(${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER 8.2) message(WARNING "Found GCC ${CMAKE_CXX_COMPILER_VERSION} which is too high, recommended to use GCC 8.2") endif() @@ -20,23 +20,15 @@ function(CheckCompilerCXX11Flag) message(FATAL_ERROR "Unsupported AppleClang version. 
AppleClang >= 5.1 required.") endif() else() - if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 3.3) - message(FATAL_ERROR "Unsupported Clang version. Clang >= 3.3 required.") + if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 3.4) + message(FATAL_ERROR "Unsupported Clang version. Clang >= 3.4 required.") endif() endif() endif() endfunction() -CheckCompilerCXX11Flag() -if (WITH_GPU) - if (${CMAKE_CUDA_COMPILER_VERSION} GREATER_EQUAL 11.0) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14") - else() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") - endif() -else() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") -endif() +CheckCompilerCXX14Flag() +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14") # safe_set_flag # # Set a compile flag only if compiler is support diff --git a/cmake/generic.cmake b/cmake/generic.cmake index ba86cfabdf173467973b9d4337e6edbbe84c5889..a5c74a46631e9d76fa78261f706a1853a80bab32 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -447,9 +447,20 @@ function(cc_test TARGET_NAME) cc_test_build(${TARGET_NAME} SRCS ${cc_test_SRCS} DEPS ${cc_test_DEPS}) - cc_test_run(${TARGET_NAME} - COMMAND ${TARGET_NAME} - ARGS ${cc_test_ARGS}) + # we dont test hcom op, because it need complex configuration + # with more than one machine + if(NOT ("${TARGET_NAME}" STREQUAL "c_broadcast_op_npu_test" OR + "${TARGET_NAME}" STREQUAL "c_allreduce_sum_op_npu_test" OR + "${TARGET_NAME}" STREQUAL "c_allreduce_max_op_npu_test" OR + "${TARGET_NAME}" STREQUAL "c_reducescatter_op_npu_test" OR + "${TARGET_NAME}" STREQUAL "c_allgather_op_npu_test" OR + "${TARGET_NAME}" STREQUAL "send_v2_op_npu_test" OR + "${TARGET_NAME}" STREQUAL "c_reduce_sum_op_npu_test" OR + "${TARGET_NAME}" STREQUAL "recv_v2_op_npu_test")) + cc_test_run(${TARGET_NAME} + COMMAND ${TARGET_NAME} + ARGS ${cc_test_ARGS}) + endif() endif() endfunction(cc_test) @@ -492,10 +503,8 @@ function(nv_library TARGET_NAME) message(FATAL "Please specify source file or library in nv_library.") endif() endif(nv_library_SRCS) - if (WIN32 AND ${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) - if(${MSVC_VERSION} LESS_EQUAL 1900) - set_target_properties(${TARGET_NAME} PROPERTIES VS_USER_PROPS ${WIN_PROPS}) - endif() + if((CUDA_VERSION GREATER 9.2) AND (CUDA_VERSION LESS 11.0) AND (MSVC_VERSION LESS 1910)) + set_target_properties(${TARGET_NAME} PROPERTIES VS_USER_PROPS ${WIN_PROPS}) endif() endif() endfunction(nv_library) @@ -512,7 +521,7 @@ function(nv_binary TARGET_NAME) add_dependencies(${TARGET_NAME} ${nv_binary_DEPS}) common_link(${TARGET_NAME}) endif() - if (WIN32 AND ${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) + if((CUDA_VERSION GREATER 9.2) AND (CUDA_VERSION LESS 11.0) AND (MSVC_VERSION LESS 1910)) set_target_properties(${TARGET_NAME} PROPERTIES VS_USER_PROPS ${WIN_PROPS}) endif() endif() @@ -539,7 +548,7 @@ function(nv_test TARGET_NAME) set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true) set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true) set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true) - if (WIN32 AND ${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) + if((CUDA_VERSION GREATER 9.2) AND (CUDA_VERSION LESS 11.0) AND (MSVC_VERSION LESS 1910)) set_target_properties(${TARGET_NAME} PROPERTIES VS_USER_PROPS ${WIN_PROPS}) endif() endif() @@ -809,7 +818,7 @@ function(py_test TARGET_NAME) ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS} WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) endif() - + if (WIN32) 
set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 150) endif() diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index 2cba3d06936081097a773295c7f91e7aa53564a6..9694a7bc59c12a96e1c0c33488895ae94dbf2a03 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -192,6 +192,15 @@ include_directories(${CMAKE_BINARY_DIR}/../paddle/fluid/framework/io) copy(inference_lib_dist SRCS ${PADDLE_SOURCE_DIR}/paddle/fluid/extension/include/* DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/) +copy(inference_lib_dist + SRCS ${PADDLE_SOURCE_DIR}/paddle/fluid/platform/complex64.h + DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/) +copy(inference_lib_dist + SRCS ${PADDLE_SOURCE_DIR}/paddle/fluid/platform/complex128.h + DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/) +copy(inference_lib_dist + SRCS ${PADDLE_SOURCE_DIR}/paddle/fluid/platform/float16.h + DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/) # CAPI inference library for only inference set(PADDLE_INFERENCE_C_INSTALL_DIR "${CMAKE_BINARY_DIR}/paddle_inference_c_install_dir" CACHE STRING @@ -202,11 +211,11 @@ set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid") if(WIN32) set(paddle_inference_c_lib $/paddle_inference_c.*) else(WIN32) - set(paddle_inference_c_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/capi/libpaddle_inference_c.*) + set(paddle_inference_c_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/capi_exp/libpaddle_inference_c.*) endif(WIN32) copy(inference_lib_dist - SRCS ${src_dir}/inference/capi/paddle_c_api.h ${paddle_inference_c_lib} + SRCS ${src_dir}/inference/capi_exp/pd_*.h ${paddle_inference_c_lib} DSTS ${PADDLE_INFERENCE_C_INSTALL_DIR}/paddle/include ${PADDLE_INFERENCE_C_INSTALL_DIR}/paddle/lib) # fluid library for both train and inference diff --git a/cmake/init.cmake b/cmake/init.cmake index aea02088750df4edc71a4909489c8ba250c8bb64..b11156d2e9986f879dcf4dd63354edb81c493260 100644 --- a/cmake/init.cmake +++ b/cmake/init.cmake @@ -18,6 +18,10 @@ if(NOT WIN32) set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O2 -g -DNDEBUG") set(CMAKE_CXX_FLAGS_MINSIZEREL "-Os -DNDEBUG") else() + # It can specify CUDA compile flag manualy, + # its use is to remvoe /Zi to reduce GPU static library size. But it's dangerous + # because CUDA will update by nvidia, then error will occur. + # Now, it's only used in VS2015 + CUDA:[10.0, 10.2] set(WIN_PROPS ${CMAKE_SOURCE_DIR}/cmake/paddle_win.props) endif() diff --git a/cmake/operators.cmake b/cmake/operators.cmake index 0343ff3cc292d97dcc77108735baa69c804468af..33390745cc8c96bc00b9eab84dfb637a8a76c2f9 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -11,6 +11,7 @@ function(op_library TARGET) set(cu_cc_srcs) set(hip_cc_srcs) set(xpu_cc_srcs) + set(npu_cc_srcs) set(cudnn_cu_cc_srcs) set(miopen_cu_cc_srcs) set(cudnn_cu_srcs) @@ -20,6 +21,9 @@ function(op_library TARGET) set(mkldnn_cc_srcs) set(MKLDNN_FILE) set(op_common_deps operator op_registry math_function layer common_infer_shape_functions) + if (WITH_ASCEND_CL) + set(op_common_deps ${op_common_deps} npu_op_runner) + endif() # Option `UNITY` is used to specify that operator `TARGET` will compiles with Unity Build. 
set(options UNITY) set(oneValueArgs "") @@ -40,6 +44,9 @@ function(op_library TARGET) if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu) list(APPEND cu_srcs ${TARGET}.cu) endif() + if (WITH_NV_JETSON) + list(REMOVE_ITEM cu_srcs "decode_jpeg_op.cu") + endif() if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu) set(PART_CUDA_KERNEL_FILES ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu ${PART_CUDA_KERNEL_FILES} PARENT_SCOPE) @@ -85,6 +92,12 @@ function(op_library TARGET) list(APPEND xpu_cc_srcs ${XPU_FILE}.cc) endif() endif() + if(WITH_ASCEND_CL) + string(REPLACE "_op" "_op_npu" NPU_FILE "${TARGET}") + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${NPU_FILE}.cc) + list(APPEND npu_cc_srcs ${NPU_FILE}.cc) + endif() + endif() else() foreach(src ${op_library_SRCS}) if(WITH_ROCM AND ${src} MATCHES ".*_cudnn_op.cu$") @@ -107,6 +120,8 @@ function(op_library TARGET) list(APPEND cu_cc_srcs ${src}) elseif(WITH_XPU AND ${src} MATCHES ".*_op_xpu.cc$") list(APPEND xpu_cc_srcs ${src}) + elseif(WITH_ASCEND_CL AND ${src} MATCHES ".*_op_npu.cc$") + list(APPEND npu_cc_srcs ${src}) elseif(${src} MATCHES ".*\\.cc$") list(APPEND cc_srcs ${src}) else() @@ -168,15 +183,15 @@ function(op_library TARGET) list(REMOVE_ITEM miopen_cu_cc_srcs "affine_grid_cudnn_op.cu.cc") list(REMOVE_ITEM miopen_cu_cc_srcs "grid_sampler_cudnn_op.cu.cc") list(REMOVE_ITEM hip_srcs "cholesky_op.cu") - list(REMOVE_ITEM hip_srcs "correlation_op.cu") list(REMOVE_ITEM hip_srcs "multinomial_op.cu") + list(REMOVE_ITEM hip_srcs "decode_jpeg_op.cu") hip_library(${TARGET} SRCS ${cc_srcs} ${hip_cc_srcs} ${miopen_cu_cc_srcs} ${miopen_cu_srcs} ${mkldnn_cc_srcs} ${hip_srcs} DEPS ${op_library_DEPS} ${op_common_deps}) else() # Unity Build relies on global option `WITH_UNITY_BUILD` and local option `UNITY`. if(WITH_UNITY_BUILD AND op_library_UNITY) # Combine the cc source files. - compose_unity_target_sources(${UNITY_TARGET} cc ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs}) + compose_unity_target_sources(${UNITY_TARGET} cc ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} ${npu_cc_srcs}) if(TARGET ${UNITY_TARGET}) # If `UNITY_TARGET` exists, add source files to `UNITY_TARGET`. target_sources(${UNITY_TARGET} PRIVATE ${unity_target_cc_sources}) @@ -187,7 +202,7 @@ function(op_library TARGET) # Add alias library to handle dependencies. add_library(${TARGET} ALIAS ${UNITY_TARGET}) else() - cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} DEPS ${op_library_DEPS} + cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} ${npu_cc_srcs} DEPS ${op_library_DEPS} ${op_common_deps}) endif() endif() @@ -207,6 +222,7 @@ function(op_library TARGET) # The registration of USE_OP, please refer to paddle/fluid/framework/op_registry.h. # Note that it's enough to just adding one operator to pybind in a *_op.cc file. # And for detail pybind information, please see generated paddle/pybind/pybind.h. 
+ set(ORIGINAL_TARGET ${TARGET}) file(READ ${TARGET}.cc TARGET_CONTENT) string(REGEX MATCH "REGISTER_OPERATOR\\(.*REGISTER_OPERATOR\\(" multi_register "${TARGET_CONTENT}") # [ \t\r\n]* is used for blank characters @@ -239,8 +255,9 @@ function(op_library TARGET) list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len) list(LENGTH xpu_cc_srcs xpu_cc_srcs_len) list(LENGTH miopen_cu_cc_srcs miopen_cu_cc_srcs_len) + list(LENGTH npu_cc_srcs npu_cc_srcs_len) if (${pybind_flag} EQUAL 0 AND ${mkldnn_cc_srcs_len} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0 AND - ${hip_srcs_len} EQUAL 0 AND ${hip_cc_srcs_len} EQUAL 0 AND ${miopen_cu_cc_srcs_len} EQUAL 0 AND ${xpu_cc_srcs_len} EQUAL 0) + ${hip_srcs_len} EQUAL 0 AND ${hip_cc_srcs_len} EQUAL 0 AND ${miopen_cu_cc_srcs_len} EQUAL 0 AND ${xpu_cc_srcs_len} EQUAL 0 AND ${npu_cc_srcs_len} EQUAL 0) file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n") set(pybind_flag 1) endif() @@ -280,6 +297,26 @@ function(op_library TARGET) if (WITH_XPU AND ${xpu_cc_srcs_len} GREATER 0) file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, XPU);\n") endif() + + if (WITH_ASCEND_CL AND ${npu_cc_srcs_len} GREATER 0) + file(READ ${ORIGINAL_TARGET}_npu.cc TARGET_NPU_CONTENT) + # It is different from the logic above, becareful + string(REGEX MATCH "REGISTER_OP_NPU_KERNEL\\(.*" multi_npu_register "${TARGET_NPU_CONTENT}") + # [ \t\r\n]* is used for blank characters + string(REGEX MATCH "REGISTER_OP_NPU_KERNEL\\([ \t\r\n]*[a-z0-9_]*," one_npu_register "${multi_npu_register}") + + if (one_npu_register STREQUAL "") + string(REPLACE "_op" "" NPU_TARGET "${TARGET}") + else () + string(REPLACE "REGISTER_OP_NPU_KERNEL(" "" NPU_TARGET "${one_npu_register}") + string(REPLACE "," "" NPU_TARGET "${NPU_TARGET}") + # [ \t\r\n]+ is used for blank characters. + # Here we use '+' instead of '*' since it is a REPLACE operation. + string(REGEX REPLACE "[ \t\r\n]+" "" NPU_TARGET "${NPU_TARGET}") + endif() + file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${NPU_TARGET}, NPU);\n") + endif() + # pybind USE_OP_DEVICE_KERNEL for MKLDNN if (WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0) # Append first implemented MKLDNN activation operator @@ -330,6 +367,7 @@ function(register_operators) file(GLOB OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc") string(REPLACE "_mkldnn" "" OPS "${OPS}") string(REPLACE "_xpu" "" OPS "${OPS}") + string(REPLACE "_npu" "" OPS "${OPS}") string(REPLACE ".cc" "" OPS "${OPS}") list(REMOVE_DUPLICATES OPS) list(LENGTH register_operators_DEPS register_operators_DEPS_len) diff --git a/cmake/paddle_win.props b/cmake/paddle_win.props index 0115ad4b59fc466ea10be6912257c40d31ed3640..3c069bd2981c437a1450ede29db2449dc46a9a4a 100644 --- a/cmake/paddle_win.props +++ b/cmake/paddle_win.props @@ -15,7 +15,7 @@ InheritFromHost -ccbin "%(VCBinDir)" -x cu [GenerateRelocatableDeviceCode] [Include] [RequiredIncludes] [InterleaveSourceInPTX] [GPUDebugInfo] [GenerateLineInfo] [Keep] [KeepDir] [MaxRegCount] [PtxAsOptionV] [TargetMachinePlatform] [NvccCompilation] [CudaRuntime] [AdditionalOptions] - --use-local-env --cl-version $(CudaClVersion) + --use-local-env $(CudaClVersion) [CodeGeneration] -clean @@ -88,4 +88,3 @@ set CUDAFE_FLAGS=--sdk_dir "$(WindowsSdkDir)" - diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake index 6488d29afc5f7f4af72aab1cf2463d900a89fa9d..56edaff2a50dab0f7029ec1e85fc3d4ce8ac416e 100644 --- a/cmake/third_party.cmake +++ b/cmake/third_party.cmake @@ -29,9 +29,9 @@ set(third_party_deps) # 2. 
REPOSITORY: specify git REPOSITORY of 3rd party # 3. TAG: specify git tag/branch/commitID of 3rd party # 4. DIR: overwrite the original SOURCE_DIR when cache directory -# +# # The function Return 1 PARENT_SCOPE variables: -# - ${TARGET}_DOWNLOAD_CMD: Simply place "${TARGET}_DOWNLOAD_CMD" in ExternalProject_Add, +# - ${TARGET}_DOWNLOAD_CMD: Simply place "${TARGET}_DOWNLOAD_CMD" in ExternalProject_Add, # and you no longer need to set any donwnload steps in ExternalProject_Add. # For example: # Cache_third_party(${TARGET} @@ -52,7 +52,7 @@ FUNCTION(cache_third_party TARGET) SET(${TARGET_NAME}_DOWNLOAD_CMD GIT_REPOSITORY ${cache_third_party_REPOSITORY}) IF(cache_third_party_TAG) - LIST(APPEND ${TARGET_NAME}_DOWNLOAD_CMD + LIST(APPEND ${TARGET_NAME}_DOWNLOAD_CMD GIT_TAG ${cache_third_party_TAG}) ENDIF() ELSEIF(cache_third_party_URL) @@ -130,7 +130,7 @@ ENDFUNCTION() # Correction of flags on different Platform(WIN/MAC) and Print Warning Message if (APPLE) if(WITH_MKL) - MESSAGE(WARNING + MESSAGE(WARNING "Mac is not supported with MKL in Paddle yet. Force WITH_MKL=OFF.") set(WITH_MKL OFF CACHE STRING "Disable MKL for building on mac" FORCE) endif() @@ -141,7 +141,7 @@ if(WIN32 OR APPLE) SET(WITH_XBYAK OFF CACHE STRING "Disable XBYAK in Windows and MacOS" FORCE) if(WITH_LIBXSMM) - MESSAGE(WARNING + MESSAGE(WARNING "Windows, Mac are not supported with libxsmm in Paddle yet." "Force WITH_LIBXSMM=OFF") SET(WITH_LIBXSMM OFF CACHE STRING "Disable LIBXSMM in Windows and MacOS" FORCE) @@ -261,6 +261,14 @@ if(WITH_PSLIB) if(WITH_PSLIB_BRPC) include(external/pslib_brpc) # download, build, install pslib_brpc list(APPEND third_party_deps extern_pslib_brpc) + else() + include(external/snappy) + list(APPEND third_party_deps extern_snappy) + + include(external/leveldb) + list(APPEND third_party_deps extern_leveldb) + include(external/brpc) + list(APPEND third_party_deps extern_brpc) endif() endif(WITH_PSLIB) @@ -274,10 +282,15 @@ if(WITH_BOX_PS) list(APPEND third_party_deps extern_box_ps) endif(WITH_BOX_PS) -if(WITH_ASCEND) +if(WITH_ASCEND OR WITH_ASCEND_CL) include(external/ascend) - list(APPEND third_party_deps extern_ascend) -endif (WITH_ASCEND) + if(WITH_ASCEND OR WITH_ASCEND_CL) + list(APPEND third_party_deps extern_ascend) + endif() + if(WITH_ASCEND_CL) + list(APPEND third_party_deps extern_ascend_cl) + endif() +endif () if (WITH_PSCORE) include(external/snappy) @@ -285,7 +298,7 @@ if (WITH_PSCORE) include(external/leveldb) list(APPEND third_party_deps extern_leveldb) - + include(external/brpc) list(APPEND third_party_deps extern_brpc) diff --git a/go/README_cn.md b/go/README_cn.md index a184ecbb8dea1ae71074ef9686d088a5f4cf0f33..040540e939bc3a0993e7c963b281ad91fbfe1ffc 100644 --- a/go/README_cn.md +++ b/go/README_cn.md @@ -50,6 +50,7 @@ output_data := value.Interface().([][]float32) 运行 ```bash +go mod init github.com/paddlepaddle export LD_LIBRARY_PATH=`pwd`/paddle_c/paddle/lib:$LD_LIBRARY_PATH go run ./demo/mobilenet.go ``` diff --git a/go/demo/mobilenet.go b/go/demo/mobilenet.go index 1b42fe8049a584616da7b4940fd19a89df9bc52b..c1ca2e967f72dc6646a6785d86ba59c709bfe25c 100644 --- a/go/demo/mobilenet.go +++ b/go/demo/mobilenet.go @@ -13,7 +13,7 @@ // limitations under the License. 
package main -import "../paddle" +import "github.com/paddlepaddle/paddle" import "strings" import "io/ioutil" import "strconv" diff --git a/go/demo/mobilenet_c_exp.cc b/go/demo/mobilenet_c_exp.cc new file mode 100644 index 0000000000000000000000000000000000000000..b4f42dab6790bfb6dd33860a8ada704166bb74ac --- /dev/null +++ b/go/demo/mobilenet_c_exp.cc @@ -0,0 +1,84 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include +#include +#include + +void ReadData(float* data, int size); + +int main(int argc, char* argv[]) { + PD_Config* config = PD_ConfigCreate(); + PD_ConfigSetModel(config, "data/model/__model__", "data/model/__params__"); + PD_ConfigDisableGlogInfo(config); + + PD_Predictor* predictor = PD_PredictorCreate(config); + // config has destroyed in PD_PredictorCreate + config = NULL; + + int input_num = PD_PredictorGetInputNum(predictor); + printf("Input num: %d\n", input_num); + int output_num = PD_PredictorGetOutputNum(predictor); + printf("Output num: %d\n", output_num); + + PD_OneDimArrayCstr* input_names = PD_PredictorGetInputNames(predictor); + PD_Tensor* input_tensor = + PD_PredictorGetInputHandle(predictor, input_names->data[0]); + PD_OneDimArrayCstrDestroy(input_names); + input_names = NULL; + + int32_t shape[] = {1, 3, 300, 300}; + float* data = (float*)malloc(sizeof(float) * 1 * 3 * 300 * 300); // NOLINT + ReadData(data, 1 * 3 * 300 * 300); // NOLINT + PD_TensorReshape(input_tensor, 4, shape); + PD_TensorCopyFromCpuFloat(input_tensor, data); + free(data); + data = NULL; + PD_PredictorRun(predictor); + + PD_OneDimArrayCstr* output_names = PD_PredictorGetOutputNames(predictor); + PD_Tensor* output_tensor = + PD_PredictorGetOutputHandle(predictor, output_names->data[0]); + PD_OneDimArrayCstrDestroy(output_names); + output_names = nullptr; + + PD_OneDimArrayInt32* out_shape = PD_TensorGetShape(output_tensor); + int32_t size = 1; + for (size_t index = 0; index < out_shape->size; ++index) { + size = size * out_shape->data[index]; + } + PD_OneDimArrayInt32Destroy(out_shape); + out_shape = NULL; + + data = (float*)malloc(sizeof(float) * size); // NOLINT + PD_TensorCopyToCpuFloat(output_tensor, data); + free(data); + data = NULL; + + PD_TensorDestroy(output_tensor); + output_tensor = NULL; + PD_TensorDestroy(input_tensor); + input_tensor = NULL; + PD_PredictorDestroy(predictor); + predictor = NULL; + + return 0; +} + +void ReadData(float* data, int n) { + FILE* fp = fopen("data/data.txt", "r"); + for (int i = 0; i < n; i++) { + fscanf(fp, "%f", &data[i]); + } + fclose(fp); +} diff --git a/go/paddle/common.go b/go/paddle/common.go index 4bf947659312824216e6003cb2f150ae39a94d00..cbbde6a45f59b80931a3a2c501581819085e8ea7 100644 --- a/go/paddle/common.go +++ b/go/paddle/common.go @@ -15,7 +15,7 @@ package paddle // #cgo CFLAGS: -I${SRCDIR}/../paddle_c/paddle/include -// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_fluid_c +// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_inference_c // 
#include // #include import "C" diff --git a/go/paddle/config.go b/go/paddle/config.go index 89f7d7e63ff2a858f058ad22ea424b29f66a4477..68a31230997bed73fbab1c1d1c7af123e353cf97 100644 --- a/go/paddle/config.go +++ b/go/paddle/config.go @@ -15,7 +15,7 @@ package paddle // #cgo CFLAGS: -I${SRCDIR}/../paddle_c/paddle/include -// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_fluid_c +// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_inference_c // #include // #include // #include diff --git a/go/paddle/predictor.go b/go/paddle/predictor.go index 59bad908e6a5082e38b8bb33c849aa1097107d76..5f2b2c81a60549dfdbf22dd31a98560e7e3a8cee 100644 --- a/go/paddle/predictor.go +++ b/go/paddle/predictor.go @@ -15,7 +15,7 @@ package paddle // #cgo CFLAGS: -I${SRCDIR}/../paddle_c/paddle/include -// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_fluid_c +// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_inference_c // #include // #include "paddle_c_api.h" import "C" @@ -88,7 +88,7 @@ func (predictor *Predictor) GetInputNames() []string { } func (predictor *Predictor) GetOutputNames() []string { - names := make([]string, predictor.GetInputNum()) + names := make([]string, predictor.GetOutputNum()) for i := 0; i < len(names); i++ { names[i] = predictor.GetOutputName(i) } diff --git a/go/paddle/tensor.go b/go/paddle/tensor.go index e6e2c53fef1af565d4efba976d10839efe22517d..6fbcf039f88a7cc43a5d28f0433c9feb965566f0 100644 --- a/go/paddle/tensor.go +++ b/go/paddle/tensor.go @@ -15,7 +15,7 @@ package paddle // #cgo CFLAGS: -I${SRCDIR}/../paddle_c/paddle/include -// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_fluid_c +// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_inference_c // #include // #include // #include @@ -209,7 +209,7 @@ func DecodeTensor(r *bytes.Reader, shape []int32, t reflect.Type, ptr reflect.Va value := reflect.Indirect(ptr) value.Set(reflect.MakeSlice(t, int(shape[0]), int(shape[0]))) if len(shape) == 1 && value.Len() > 0 { - switch value.Index(1).Kind() { + switch value.Index(0).Kind() { case reflect.Uint8, reflect.Int32, reflect.Int64, reflect.Float32: binary.Read(r, Endian(), value.Interface()) return diff --git a/paddle/extension.h b/paddle/extension.h index 71469576853a33b9158713304a68c6ac757aab4f..98d4bfd0326c5c524fcac9129f58d0ae99fc8afe 100644 --- a/paddle/extension.h +++ b/paddle/extension.h @@ -15,4 +15,4 @@ limitations under the License. */ #pragma once // All paddle apis in C++ frontend -#include "paddle/fluid/extension/include/ext_all.h" +#include "paddle/extension/include/ext_all.h" diff --git a/paddle/fluid/CMakeLists.txt b/paddle/fluid/CMakeLists.txt index c18332d3b873164a725a25316fc611aa7e7a3092..dcff02a662e2734bc66d4cf219fce527fd0961aa 100644 --- a/paddle/fluid/CMakeLists.txt +++ b/paddle/fluid/CMakeLists.txt @@ -9,4 +9,3 @@ add_subdirectory(pybind) # NOTE: please add subdirectory inference at last. 
add_subdirectory(inference) -add_subdirectory(train) diff --git a/paddle/fluid/distributed/CMakeLists.txt b/paddle/fluid/distributed/CMakeLists.txt index 5a2d7a06201ba4acff679ffcfee87fde8d025ed6..905347d031b35b39b43879c7bd78ab39e933a5b3 100644 --- a/paddle/fluid/distributed/CMakeLists.txt +++ b/paddle/fluid/distributed/CMakeLists.txt @@ -11,9 +11,10 @@ if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0) "${DISTRIBUTE_COMPILE_FLAGS} -faligned-new") endif() -add_subdirectory(table) add_subdirectory(service) +add_subdirectory(table) add_subdirectory(test) +add_subdirectory(index_dataset) get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS) diff --git a/paddle/fluid/distributed/fleet.cc b/paddle/fluid/distributed/fleet.cc index b638af49730dd4800109729c9d91afa82efa80e4..dfd55f16e1a065e46b2186a6a589eabc1ac3b431 100644 --- a/paddle/fluid/distributed/fleet.cc +++ b/paddle/fluid/distributed/fleet.cc @@ -177,8 +177,11 @@ std::future FleetWrapper::PullSparseVarsAsync( for (auto& t : *fea_values) { pull_result_ptr.push_back(t.data()); } - return pserver_ptr_->_worker_ptr->pull_sparse( - pull_result_ptr.data(), table_id, fea_keys->data(), fea_keys->size()); + + bool training = true; + return pserver_ptr_->_worker_ptr->pull_sparse(pull_result_ptr.data(), + table_id, fea_keys->data(), + fea_keys->size(), training); } void FleetWrapper::PullSparseVarsSync( @@ -224,8 +227,10 @@ void FleetWrapper::PullSparseVarsSync( for (auto& t : *fea_values) { pull_result_ptr.push_back(t.data()); } + bool training = true; auto status = pserver_ptr_->_worker_ptr->pull_sparse( - pull_result_ptr.data(), table_id, fea_keys->data(), fea_keys->size()); + pull_result_ptr.data(), table_id, fea_keys->data(), fea_keys->size(), + training); pull_sparse_status.push_back(std::move(status)); for (auto& t : pull_sparse_status) { t.wait(); @@ -238,9 +243,13 @@ void FleetWrapper::PullSparseVarsSync( } } +// is_training is true means training, false means inference, the behavior is +// different on pserver + void FleetWrapper::PullSparseToTensorSync(const uint64_t table_id, int fea_dim, uint64_t padding_id, platform::Place place, + bool is_training, std::vector* inputs, std::vector* outputs) { std::vector fea_keys; @@ -279,7 +288,8 @@ void FleetWrapper::PullSparseToTensorSync(const uint64_t table_id, int fea_dim, } auto* communicator = Communicator::GetInstance(); auto status = communicator->_worker_ptr->pull_sparse( - pull_result_ptr.data(), table_id, fea_keys.data(), fea_keys.size()); + pull_result_ptr.data(), table_id, fea_keys.data(), fea_keys.size(), + is_training); status.wait(); auto ret = status.get(); if (ret != 0) { diff --git a/paddle/fluid/distributed/fleet.h b/paddle/fluid/distributed/fleet.h index ac566606ddcb4024eeaf7b846c894f7f5cdafa82..0da5d1e2bf987f38de3b9a03c659fc5e1841eca1 100644 --- a/paddle/fluid/distributed/fleet.h +++ b/paddle/fluid/distributed/fleet.h @@ -95,8 +95,12 @@ class FleetWrapper { // Pull sparse variables from server in sync mode // pull immediately to tensors + // is_training is true means training, false means inference, the behavior is + // different on pserver + void PullSparseToTensorSync(const uint64_t table_id, int fea_dim, uint64_t padding_id, platform::Place place, + bool is_training, std::vector* inputs, // NOLINT std::vector* outputs); // NOLINT diff --git a/paddle/fluid/distributed/index_dataset/CMakeLists.txt b/paddle/fluid/distributed/index_dataset/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..a30488494a52bcfea61476caeb1ab08e3e6781a1 --- /dev/null +++ 
b/paddle/fluid/distributed/index_dataset/CMakeLists.txt @@ -0,0 +1,7 @@ +proto_library(index_dataset_proto SRCS index_dataset.proto) +cc_library(index_wrapper SRCS index_wrapper.cc DEPS index_dataset_proto fs) +cc_library(index_sampler SRCS index_sampler.cc DEPS index_wrapper) + +if(WITH_PYTHON) + py_proto_compile(index_dataset_py_proto SRCS index_dataset.proto) +endif() diff --git a/paddle/fluid/distributed/index_dataset/index_dataset.proto b/paddle/fluid/distributed/index_dataset/index_dataset.proto new file mode 100644 index 0000000000000000000000000000000000000000..1b4ee313671ad503b9e46dbe9e34d4a69d0cfc4d --- /dev/null +++ b/paddle/fluid/distributed/index_dataset/index_dataset.proto @@ -0,0 +1,32 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto2"; +package paddle.distributed; + +message IndexNode { + required uint64 id = 1; + required bool is_leaf = 2; + required float probability = 3; +} + +message TreeMeta { + required int32 height = 1; + required int32 branch = 2; +} + +message KVItem { + required bytes key = 1; + required bytes value = 2; +} \ No newline at end of file diff --git a/paddle/fluid/distributed/index_dataset/index_sampler.cc b/paddle/fluid/distributed/index_dataset/index_sampler.cc new file mode 100644 index 0000000000000000000000000000000000000000..3e573bbdd2de97130a109ddb583a724cf363c6be --- /dev/null +++ b/paddle/fluid/distributed/index_dataset/index_sampler.cc @@ -0,0 +1,74 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
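The `index_dataset.proto` messages introduced above (`IndexNode`, `TreeMeta`, `KVItem`) describe the on-disk records of a tree index. As a rough, hypothetical sketch only (not part of this patch), the snippet below shows how a file in the format that `TreeIndex::Load` in `index_wrapper.cc` expects could be produced: each record is a native `int` length prefix followed by a serialized `KVItem`, where the reserved key `.tree_meta` holds a `TreeMeta` and every other key is a node code whose value is an `IndexNode`. The output path and node values are invented for illustration.

```cpp
#include <cstdio>
#include <string>

#include "paddle/fluid/distributed/index_dataset/index_dataset.pb.h"

namespace pd = paddle::distributed;

// Writes one length-prefixed KVItem record, mirroring what TreeIndex::Load
// reads back: a native int with the byte length, then the serialized message.
static void WriteRecord(std::FILE* fp, const std::string& key,
                        const std::string& value) {
  pd::KVItem item;
  item.set_key(key);
  item.set_value(value);
  std::string buf;
  item.SerializeToString(&buf);
  int len = static_cast<int>(buf.size());
  std::fwrite(&len, sizeof(len), 1, fp);
  std::fwrite(buf.data(), 1, buf.size(), fp);
}

int main() {
  // Hypothetical output path; a real tree would contain many more nodes.
  std::FILE* fp = std::fopen("demo_tree.idx", "wb");

  pd::TreeMeta meta;
  meta.set_height(2);
  meta.set_branch(2);
  WriteRecord(fp, ".tree_meta", meta.SerializeAsString());

  // One leaf node stored under its tree code ("1") with item id 10.
  pd::IndexNode node;
  node.set_id(10);
  node.set_is_leaf(true);
  node.set_probability(1.0f);
  WriteRecord(fp, "1", node.SerializeAsString());

  std::fclose(fp);
  return 0;
}
```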
+ +#include "paddle/fluid/distributed/index_dataset/index_sampler.h" + +namespace paddle { +namespace distributed { + +std::vector> LayerWiseSampler::sample( + const std::vector>& user_inputs, + const std::vector& target_ids, bool with_hierarchy) { + auto input_num = target_ids.size(); + auto user_feature_num = user_inputs[0].size(); + std::vector> outputs( + input_num * layer_counts_sum_, + std::vector(user_feature_num + 2)); + + auto max_layer = tree_->Height(); + size_t idx = 0; + for (size_t i = 0; i < input_num; i++) { + auto travel_codes = + tree_->GetTravelCodes(target_ids[i], start_sample_layer_); + auto travel_path = tree_->GetNodes(travel_codes); + for (size_t j = 0; j < travel_path.size(); j++) { + // user + if (j > 0 && with_hierarchy) { + auto ancestor_codes = + tree_->GetAncestorCodes(user_inputs[i], max_layer - j - 1); + auto hierarchical_user = tree_->GetNodes(ancestor_codes); + for (int idx_offset = 0; idx_offset <= layer_counts_[j]; idx_offset++) { + for (size_t k = 0; k < user_feature_num; k++) { + outputs[idx + idx_offset][k] = hierarchical_user[k].id(); + } + } + } else { + for (int idx_offset = 0; idx_offset <= layer_counts_[j]; idx_offset++) { + for (size_t k = 0; k < user_feature_num; k++) { + outputs[idx + idx_offset][k] = user_inputs[i][k]; + } + } + } + + // sampler ++ + outputs[idx][user_feature_num] = travel_path[j].id(); + outputs[idx][user_feature_num + 1] = 1.0; + idx += 1; + for (int idx_offset = 0; idx_offset < layer_counts_[j]; idx_offset++) { + int sample_res = 0; + do { + sample_res = sampler_vec_[j]->Sample(); + } while (layer_ids_[j][sample_res].id() == travel_path[j].id()); + outputs[idx + idx_offset][user_feature_num] = + layer_ids_[j][sample_res].id(); + outputs[idx + idx_offset][user_feature_num + 1] = 0; + } + idx += layer_counts_[j]; + } + } + return outputs; +} + +} // end namespace distributed +} // end namespace paddle diff --git a/paddle/fluid/distributed/index_dataset/index_sampler.h b/paddle/fluid/distributed/index_dataset/index_sampler.h new file mode 100644 index 0000000000000000000000000000000000000000..8813421446a21c1379ca872952fe8b367d0724ca --- /dev/null +++ b/paddle/fluid/distributed/index_dataset/index_sampler.h @@ -0,0 +1,120 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include "paddle/fluid/distributed/index_dataset/index_wrapper.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/math/sampler.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace distributed { + +class IndexSampler { + public: + virtual ~IndexSampler() {} + IndexSampler() {} + + template + static std::shared_ptr Init(const std::string& name) { + std::shared_ptr instance = nullptr; + instance.reset(new T(name)); + return instance; + } + + virtual void init_layerwise_conf(const std::vector& layer_sample_counts, + int start_sample_layer = 1, int seed = 0) {} + virtual void init_beamsearch_conf(const int64_t k) {} + virtual std::vector> sample( + const std::vector>& user_inputs, + const std::vector& input_targets, + bool with_hierarchy = false) = 0; +}; + +class LayerWiseSampler : public IndexSampler { + public: + virtual ~LayerWiseSampler() {} + explicit LayerWiseSampler(const std::string& name) { + tree_ = IndexWrapper::GetInstance()->get_tree_index(name); + } + + void init_layerwise_conf(const std::vector& layer_sample_counts, + int start_sample_layer, int seed) override { + seed_ = seed; + start_sample_layer_ = start_sample_layer; + + PADDLE_ENFORCE_GT( + start_sample_layer_, 0, + paddle::platform::errors::InvalidArgument( + "start sampler layer = [%d], it should greater than 0.", + start_sample_layer_)); + PADDLE_ENFORCE_LT(start_sample_layer_, tree_->Height(), + paddle::platform::errors::InvalidArgument( + "start sampler layer = [%d], it should less than " + "max_layer, which is [%d].", + start_sample_layer_, tree_->Height())); + + size_t i = 0; + layer_counts_sum_ = 0; + layer_counts_.clear(); + int cur_layer = start_sample_layer_; + while (cur_layer < tree_->Height()) { + int layer_sample_num = 1; + if (i < layer_sample_counts.size()) { + layer_sample_num = layer_sample_counts[i]; + } + layer_counts_sum_ += layer_sample_num + 1; + layer_counts_.push_back(layer_sample_num); + VLOG(3) << "[INFO] level " << cur_layer + << " sample_layer_counts.push_back: " << layer_sample_num; + cur_layer += 1; + i += 1; + } + reverse(layer_counts_.begin(), layer_counts_.end()); + VLOG(3) << "sample counts sum: " << layer_counts_sum_; + + auto max_layer = tree_->Height(); + sampler_vec_.clear(); + layer_ids_.clear(); + + auto layer_index = max_layer - 1; + size_t idx = 0; + while (layer_index >= start_sample_layer_) { + auto layer_codes = tree_->GetLayerCodes(layer_index); + layer_ids_.push_back(tree_->GetNodes(layer_codes)); + auto sampler_temp = + std::make_shared( + layer_ids_[idx].size() - 1, seed_); + sampler_vec_.push_back(sampler_temp); + layer_index--; + idx++; + } + } + std::vector> sample( + const std::vector>& user_inputs, + const std::vector& target_ids, bool with_hierarchy) override; + + private: + std::vector layer_counts_; + int64_t layer_counts_sum_{0}; + std::shared_ptr tree_{nullptr}; + int seed_{0}; + int start_sample_layer_{1}; + std::vector> sampler_vec_; + std::vector> layer_ids_; +}; + +} // end namespace distributed +} // end namespace paddle diff --git a/paddle/fluid/distributed/index_dataset/index_wrapper.cc b/paddle/fluid/distributed/index_dataset/index_wrapper.cc new file mode 100644 index 0000000000000000000000000000000000000000..99fe4ca0c6d043caef01a867a5acc0d40841ee01 --- /dev/null +++ b/paddle/fluid/distributed/index_dataset/index_wrapper.cc @@ -0,0 +1,196 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/io/fs.h" + +#include +#include +#include "paddle/fluid/distributed/index_dataset/index_wrapper.h" + +namespace paddle { +namespace distributed { + +std::shared_ptr<IndexWrapper> IndexWrapper::s_instance_(nullptr); + +int TreeIndex::Load(const std::string filename) { + int err_no; + auto fp = paddle::framework::fs_open_read(filename, &err_no, ""); + PADDLE_ENFORCE_NE( + fp, nullptr, + platform::errors::InvalidArgument( + "Open file %s failed. Please check whether the file exists.", + filename)); + + int num = 0; + max_id_ = 0; + fake_node_.set_id(0); + fake_node_.set_is_leaf(false); + fake_node_.set_probability(0.0); + max_code_ = 0; + size_t ret = fread(&num, sizeof(num), 1, fp.get()); + while (ret == 1 && num > 0) { + std::string content(num, '\0'); + size_t read_num = + fread(const_cast<char *>(content.data()), 1, num, fp.get()); + PADDLE_ENFORCE_EQ( + read_num, static_cast<size_t>(num), + platform::errors::InvalidArgument( + "Read from file: %s failed. Valid Format is " + "an integer representing the length of the following string, " + "and the string itself. We got an integer [%d], " + "but the following string's length is [%d].", + filename, num, read_num)); + + KVItem item; + PADDLE_ENFORCE_EQ( + item.ParseFromString(content), true, + platform::errors::InvalidArgument("Parse from file: %s failed. 
It's " + "content can't be parsed by KVItem.", + filename)); + + if (item.key() == ".tree_meta") { + meta_.ParseFromString(item.value()); + } else { + auto code = boost::lexical_cast(item.key()); + IndexNode node; + node.ParseFromString(item.value()); + PADDLE_ENFORCE_NE(node.id(), 0, + platform::errors::InvalidArgument( + "Node'id should not be equel to zero.")); + if (node.is_leaf()) { + id_codes_map_[node.id()] = code; + } + data_[code] = node; + if (node.id() > max_id_) { + max_id_ = node.id(); + } + if (code > max_code_) { + max_code_ = code; + } + } + ret = fread(&num, sizeof(num), 1, fp.get()); + } + total_nodes_num_ = data_.size(); + max_code_ += 1; + return 0; +} + +std::vector TreeIndex::GetNodes(const std::vector& codes) { + std::vector nodes; + nodes.reserve(codes.size()); + for (size_t i = 0; i < codes.size(); i++) { + if (CheckIsValid(codes[i])) { + nodes.push_back(data_.at(codes[i])); + } else { + nodes.push_back(fake_node_); + } + } + return nodes; +} + +std::vector TreeIndex::GetLayerCodes(int level) { + uint64_t level_num = static_cast(std::pow(meta_.branch(), level)); + uint64_t level_offset = level_num - 1; + + std::vector res; + res.reserve(level_num); + for (uint64_t i = 0; i < level_num; i++) { + auto code = level_offset + i; + if (CheckIsValid(code)) { + res.push_back(code); + } + } + return res; +} + +std::vector TreeIndex::GetAncestorCodes( + const std::vector& ids, int level) { + std::vector res; + res.reserve(ids.size()); + + int cur_level; + for (size_t i = 0; i < ids.size(); i++) { + if (id_codes_map_.find(ids[i]) == id_codes_map_.end()) { + res.push_back(max_code_); + } else { + auto code = id_codes_map_.at(ids[i]); + cur_level = meta_.height() - 1; + + while (level >= 0 && cur_level > level) { + code = (code - 1) / meta_.branch(); + cur_level--; + } + res.push_back(code); + } + } + return res; +} + +std::vector TreeIndex::GetChildrenCodes(uint64_t ancestor, + int level) { + auto level_code_num = static_cast(std::pow(meta_.branch(), level)); + auto code_min = level_code_num - 1; + auto code_max = meta_.branch() * level_code_num - 1; + + std::vector parent; + parent.push_back(ancestor); + std::vector res; + size_t p_idx = 0; + while (true) { + size_t p_size = parent.size(); + for (; p_idx < p_size; p_idx++) { + for (int i = 0; i < meta_.branch(); i++) { + auto code = parent[p_idx] * meta_.branch() + i + 1; + if (data_.find(code) != data_.end()) parent.push_back(code); + } + } + if ((code_min <= parent[p_idx]) && (parent[p_idx] < code_max)) { + break; + } + } + + return std::vector(parent.begin() + p_idx, parent.end()); +} + +std::vector TreeIndex::GetTravelCodes(uint64_t id, int start_level) { + std::vector res; + PADDLE_ENFORCE_NE(id_codes_map_.find(id), id_codes_map_.end(), + paddle::platform::errors::InvalidArgument( + "id = %d doesn't exist in Tree.", id)); + auto code = id_codes_map_.at(id); + int level = meta_.height() - 1; + + while (level >= start_level) { + res.push_back(code); + code = (code - 1) / meta_.branch(); + level--; + } + return res; +} + +std::vector TreeIndex::GetAllLeafs() { + std::vector res; + res.reserve(id_codes_map_.size()); + for (auto& ite : id_codes_map_) { + auto code = ite.second; + res.push_back(data_.at(code)); + } + return res; +} + +} // end namespace distributed +} // end namespace paddle diff --git a/paddle/fluid/distributed/index_dataset/index_wrapper.h b/paddle/fluid/distributed/index_dataset/index_wrapper.h new file mode 100644 index 0000000000000000000000000000000000000000..8fb8faf6c84a2d9e1a5e80179a113b8d1ef312c8 --- 
/dev/null +++ b/paddle/fluid/distributed/index_dataset/index_wrapper.h @@ -0,0 +1,120 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/distributed/index_dataset/index_dataset.pb.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace distributed { + +class Index { + public: + Index() {} + ~Index() {} +}; + +class TreeIndex : public Index { + public: + TreeIndex() {} + ~TreeIndex() {} + + int Height() { return meta_.height(); } + int Branch() { return meta_.branch(); } + uint64_t TotalNodeNums() { return total_nodes_num_; } + uint64_t EmbSize() { return max_id_ + 1; } + int Load(const std::string path); + + inline bool CheckIsValid(int code) { + if (data_.find(code) != data_.end()) { + return true; + } else { + return false; + } + } + + std::vector GetNodes(const std::vector& codes); + std::vector GetLayerCodes(int level); + std::vector GetAncestorCodes(const std::vector& ids, + int level); + std::vector GetChildrenCodes(uint64_t ancestor, int level); + std::vector GetTravelCodes(uint64_t id, int start_level); + std::vector GetAllLeafs(); + + std::unordered_map data_; + std::unordered_map id_codes_map_; + uint64_t total_nodes_num_; + TreeMeta meta_; + uint64_t max_id_; + uint64_t max_code_; + IndexNode fake_node_; +}; + +using TreePtr = std::shared_ptr; + +class IndexWrapper { + public: + virtual ~IndexWrapper() {} + IndexWrapper() {} + + void clear_tree() { tree_map.clear(); } + + TreePtr get_tree_index(const std::string name) { + PADDLE_ENFORCE_NE(tree_map.find(name), tree_map.end(), + paddle::platform::errors::InvalidArgument( + "tree [%s] doesn't exist. Please insert it firstly " + "by API[\' insert_tree_index \'].", + name)); + return tree_map[name]; + } + + void insert_tree_index(const std::string name, const std::string tree_path) { + if (tree_map.find(name) != tree_map.end()) { + VLOG(0) << "Tree " << name << " has already existed."; + return; + } + TreePtr tree = std::make_shared(); + int ret = tree->Load(tree_path); + PADDLE_ENFORCE_EQ(ret, 0, paddle::platform::errors::InvalidArgument( + "Load tree[%s] from path[%s] failed. 
Please " + "check whether the file exists.", + name, tree_path)); + tree_map.insert(std::pair{name, tree}); + } + + static std::shared_ptr GetInstancePtr() { + if (NULL == s_instance_) { + s_instance_.reset(new paddle::distributed::IndexWrapper()); + } + return s_instance_; + } + + static IndexWrapper* GetInstance() { + if (NULL == s_instance_) { + s_instance_.reset(new paddle::distributed::IndexWrapper()); + } + return s_instance_.get(); + } + + private: + static std::shared_ptr s_instance_; + std::unordered_map tree_map; +}; + +} // end namespace distributed +} // end namespace paddle diff --git a/paddle/fluid/distributed/service/CMakeLists.txt b/paddle/fluid/distributed/service/CMakeLists.txt index bb3f6f1174da9d49a8407ec8db16a5a2aa2a8336..d1f04e26ade7289bcb10988d02de01962a1889ab 100644 --- a/paddle/fluid/distributed/service/CMakeLists.txt +++ b/paddle/fluid/distributed/service/CMakeLists.txt @@ -16,6 +16,7 @@ set_source_files_properties(communicator.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUT set_source_files_properties(service.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(brpc_ps_server.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(brpc_ps_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +set_source_files_properties(ps_local_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(brpc_utils.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(heter_server.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) @@ -24,11 +25,13 @@ set_source_files_properties(heter_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUT set_source_files_properties(client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(ps_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(server.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - +set_source_files_properties(graph_brpc_server.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +set_source_files_properties(graph_brpc_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_library(brpc_utils SRCS brpc_utils.cc DEPS tensor device_context ${COMMON_DEPS} ${RPC_DEPS}) -cc_library(downpour_server SRCS brpc_ps_server.cc DEPS boost eigen3 table brpc_utils ${RPC_DEPS}) -cc_library(downpour_client SRCS brpc_ps_client.cc DEPS boost eigen3 table brpc_utils ${RPC_DEPS}) +cc_library(downpour_server SRCS graph_brpc_server.cc brpc_ps_server.cc DEPS boost eigen3 table brpc_utils simple_threadpool ${RPC_DEPS}) +cc_library(downpour_client SRCS graph_brpc_client.cc brpc_ps_client.cc +ps_local_client.cc DEPS boost eigen3 table brpc_utils simple_threadpool ${RPC_DEPS}) cc_library(client SRCS ps_client.cc DEPS downpour_client boost ${RPC_DEPS}) cc_library(server SRCS server.cc DEPS downpour_server boost ${RPC_DEPS}) @@ -38,3 +41,6 @@ cc_library(ps_service SRCS service.cc DEPS communicator client server boost ${RP cc_library(heter_server SRCS heter_server.cc DEPS brpc_utils ${COMMON_DEPS} ${RPC_DEPS}) cc_library(heter_client SRCS heter_client.cc DEPS brpc_utils ${COMMON_DEPS} ${RPC_DEPS}) + +set_source_files_properties(graph_py_service.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +cc_library(graph_py_service SRCS graph_py_service.cc DEPS ps_service) diff --git a/paddle/fluid/distributed/service/brpc_ps_client.cc b/paddle/fluid/distributed/service/brpc_ps_client.cc index 
163526fe3b28c91f36e2670d1974b520ef3bf66a..a6ad9d08f52fda9bd79b1a1f0eebf1769c855eb3 100644 --- a/paddle/fluid/distributed/service/brpc_ps_client.cc +++ b/paddle/fluid/distributed/service/brpc_ps_client.cc @@ -768,8 +768,8 @@ std::future BrpcPsClient::push_global_step(int table_id, std::future BrpcPsClient::pull_sparse(float **select_values, size_t table_id, - const uint64_t *keys, - size_t num) { + const uint64_t *keys, size_t num, + bool is_training) { size_t request_call_num = _server_channels.size(); auto shard_sorted_kvs = std::make_shared< @@ -837,16 +837,27 @@ std::future BrpcPsClient::pull_sparse(float **select_values, uint32_t kv_request_count = 0; size_t sorted_kv_size = sorted_kvs.size(); auto &request_buffer = closure->cntl(i)->request_attachment(); + + request_buffer.append((void *)&is_training, sizeof(bool)); + std::vector keys_counter; + keys_counter.reserve(sorted_kv_size); + for (size_t kv_idx = 0; kv_idx < sorted_kv_size; ++kv_idx) { ++kv_request_count; + uint32_t keys = 1; last_key = sorted_kvs[kv_idx].first; request_buffer.append((void *)&last_key, sizeof(uint64_t)); while (kv_idx < sorted_kv_size - 1 && last_key == sorted_kvs[kv_idx + 1].first) { ++kv_idx; + ++keys; } + keys_counter.push_back(keys); } + request_buffer.append((void *)keys_counter.data(), + sizeof(uint32_t) * keys_counter.size()); + if (kv_request_count == 0) { closure->Run(); } else { @@ -869,8 +880,8 @@ std::future BrpcPsClient::send_client2client_msg( auto promise = std::make_shared>(); std::future fut = promise->get_future(); if (to_client_id >= _client_channels.size()) { - LOG(FATAL) << "to_client_id is out of range clients, which size is " - << _client_channels.size(); + VLOG(0) << "to_client_id is out of range clients, which size is " + << _client_channels.size(); promise->set_value(-1); return fut; } @@ -956,7 +967,7 @@ int32_t BrpcPsClient::recv_and_save_table(const uint64_t table_id, } auto status = pull_sparse((float **)save_vec.data(), table_id, - save_key.data(), save_key.size()); + save_key.data(), save_key.size(), true); status.wait(); // create lod tensor diff --git a/paddle/fluid/distributed/service/brpc_ps_client.h b/paddle/fluid/distributed/service/brpc_ps_client.h index 8f9d2653864d1c7fd1801632a6c84edb1bc04ccf..5192356e4b5e574de385478c57a7b7cedb49988a 100644 --- a/paddle/fluid/distributed/service/brpc_ps_client.h +++ b/paddle/fluid/distributed/service/brpc_ps_client.h @@ -148,7 +148,8 @@ class BrpcPsClient : public PSClient { virtual std::future pull_sparse(float **select_values, size_t table_id, - const uint64_t *keys, size_t num); + const uint64_t *keys, size_t num, + bool is_training); virtual std::future print_table_stat(uint32_t table_id); @@ -170,9 +171,22 @@ class BrpcPsClient : public PSClient { virtual int32_t recv_and_save_table(const uint64_t table_id, const std::string &path); - private: + protected: + virtual size_t get_server_nums() { return _server_channels.size(); } + inline brpc::Channel *get_sparse_channel(size_t server_id) { + return _server_channels[server_id][0].get(); + } + inline brpc::Channel *get_dense_channel(size_t server_id) { + return _server_channels[server_id][1].get(); + } + inline brpc::Channel *get_cmd_channel(size_t server_id) { + return _server_channels[server_id][2].get(); + } virtual int32_t initialize() override; + private: + // virtual int32_t initialize() override; + inline uint32_t dense_dim_per_shard(uint32_t dense_dim_total, uint32_t shard_num) { return dense_dim_total / shard_num + 1; @@ -184,16 +198,6 @@ class BrpcPsClient : public PSClient 
{ std::future send_save_cmd(uint32_t table_id, int cmd_id, const std::vector ¶m); - inline brpc::Channel *get_sparse_channel(size_t server_id) { - return _server_channels[server_id][0].get(); - } - inline brpc::Channel *get_dense_channel(size_t server_id) { - return _server_channels[server_id][1].get(); - } - inline brpc::Channel *get_cmd_channel(size_t server_id) { - return _server_channels[server_id][2].get(); - } - bool _running = false; bool _flushing = false; std::atomic _async_call_num; //异步请求计数 @@ -220,8 +224,6 @@ class BrpcPsClient : public PSClient { size_t num, void *done) override; - virtual size_t get_server_nums() { return _server_channels.size(); } - private: int32_t start_client_service(); diff --git a/paddle/fluid/distributed/service/brpc_ps_server.cc b/paddle/fluid/distributed/service/brpc_ps_server.cc index 8400e669182d670b892dc2eb55492a92ee919ae5..a1440260bf2e77093bb937e62b13b54ad06a3e64 100644 --- a/paddle/fluid/distributed/service/brpc_ps_server.cc +++ b/paddle/fluid/distributed/service/brpc_ps_server.cc @@ -14,6 +14,8 @@ #include "paddle/fluid/distributed/service/brpc_ps_server.h" #include // NOLINT +#include "butil/object_pool.h" +#include "paddle/fluid/distributed/table/depends/sparse_utils.h" #include "paddle/fluid/distributed/table/table.h" #include "paddle/fluid/framework/archive.h" #include "paddle/fluid/platform/profiler.h" @@ -60,7 +62,8 @@ uint64_t BrpcPsServer::start(const std::string &ip, uint32_t port) { std::unique_lock lock(mutex_); std::string ip_port = ip + ":" + std::to_string(port); - VLOG(3) << "server of rank " << _rank << " starts at " << ip_port; + VLOG(0) << "running server with rank id: " << _rank + << ", endpoint: " << ip_port; brpc::ServerOptions options; int num_threads = std::thread::hardware_concurrency(); @@ -194,12 +197,13 @@ int32_t BrpcPsService::pull_dense(Table *table, const PsRequestMessage &request, return 0; } - std::vector res_data; - res_data.resize(num * table->value_accesor()->select_size() / sizeof(float)); - table->pull_dense(res_data.data(), num); + auto res_data = butil::get_object>(); + res_data->resize(num * table->value_accesor()->select_size() / sizeof(float)); + table->pull_dense(res_data->data(), num); - cntl->response_attachment().append((char *)res_data.data(), - res_data.size() * sizeof(float)); + cntl->response_attachment().append((char *)(res_data->data()), + res_data->size() * sizeof(float)); + butil::return_object(res_data); return 0; } @@ -336,35 +340,42 @@ int32_t BrpcPsService::pull_sparse(Table *table, brpc::Controller *cntl) { platform::RecordEvent record_event("PsService->pull_sparse"); CHECK_TABLE_EXIST(table, request, response) - thread_local std::string push_sparse_request_buffer; + auto &req_io_buffer = cntl->request_attachment(); auto req_buffer_size = req_io_buffer.size(); + if (req_buffer_size < 1) { set_response_code(response, -1, "req attachment is empty"); return 0; } + if (request.params_size() < 1) { set_response_code(response, -1, "PsRequestMessage.params is requeired at " "least 1 for num of sparse_key"); return 0; } + uint32_t num = *(uint32_t *)(request.params(0).c_str()); - push_sparse_request_buffer.resize(0); - push_sparse_request_buffer.reserve(req_buffer_size); - const char *data = (const char *)cntl->request_attachment().fetch( - const_cast(push_sparse_request_buffer.data()), req_buffer_size); - /* - Attachment Content: - |---keysData---| - |---8*{num}B---| - */ - const uint64_t *keys = (const uint64_t *)data; - std::vector res_data; - res_data.resize(num * 
table->value_accesor()->select_size() / sizeof(float)); - table->pull_sparse(res_data.data(), keys, num); - cntl->response_attachment().append((char *)res_data.data(), - res_data.size() * sizeof(float)); + auto dim = table->value_accesor()->select_dim(); + + thread_local std::string req_buffer; + req_buffer.reserve(req_buffer_size); + + const void *data = cntl->request_attachment().fetch( + const_cast(req_buffer.data()), req_buffer_size); + + auto value = PullSparseValue(num, dim); + + value.DeserializeFromBytes(const_cast(data)); + + auto res_data = butil::get_object>(); + res_data->resize(num * dim); + table->pull_sparse(res_data->data(), value); + + cntl->response_attachment().append((char *)(res_data->data()), + res_data->size() * sizeof(float)); + butil::return_object(res_data); return 0; } @@ -538,7 +549,7 @@ int32_t BrpcPsService::stop_server(Table *table, auto *p_server = _server; std::thread t_stop([p_server]() { p_server->stop(); - LOG(INFO) << "Server Stoped"; + VLOG(3) << "Server Stoped"; }); t_stop.detach(); return 0; diff --git a/paddle/fluid/distributed/service/brpc_utils.cc b/paddle/fluid/distributed/service/brpc_utils.cc index 096718768149c574fd57b91396879d7bec5d37e0..a356b77e73733ed9b657a7603adf57c5228bf3c5 100644 --- a/paddle/fluid/distributed/service/brpc_utils.cc +++ b/paddle/fluid/distributed/service/brpc_utils.cc @@ -324,7 +324,7 @@ std::string GetIntTypeEndpoint(const std::string& ip, const uint32_t& port) { while (hp->h_addr_list[i] != NULL) { int_ip = inet_ntoa(*(struct in_addr*)hp->h_addr_list[i]); - VLOG(0) << "Brpc Get host by name, host:" << ip << " -> ip: " << int_ip; + VLOG(3) << "Brpc Get host by name, host:" << ip << " -> ip: " << int_ip; break; } diff --git a/paddle/fluid/distributed/service/communicator.cc b/paddle/fluid/distributed/service/communicator.cc index 8699719e5cdcc8f40cf26fc90c17ad52849804d3..3d5ab8e16d90202d2365c14f764f5e0f53929b68 100644 --- a/paddle/fluid/distributed/service/communicator.cc +++ b/paddle/fluid/distributed/service/communicator.cc @@ -320,9 +320,11 @@ void Communicator::RpcRecvSparse(const std::string &varname, int table_id, push_g_vec.push_back(tensor->data() + i * dim); } + bool training = true; + auto status = _worker_ptr->pull_sparse( (float **)push_g_vec.data(), table_id, // NOLINT - sparse_push_keys.data(), sparse_push_keys.size()); + sparse_push_keys.data(), sparse_push_keys.size(), training); status.wait(); return; } diff --git a/paddle/fluid/distributed/service/communicator.h b/paddle/fluid/distributed/service/communicator.h index 043fe9d83dfc53aaa5d13ef1f12745836129aaa0..fa60cab2b58779ede16cc51971277130bcaca909 100644 --- a/paddle/fluid/distributed/service/communicator.h +++ b/paddle/fluid/distributed/service/communicator.h @@ -310,6 +310,8 @@ class Communicator { return _worker_ptr; } + RecvCtxMap &GetRecvCtxMap() { return recv_varname_to_ctx_; } + std::shared_ptr _worker_ptr; // pointer to worker protected: diff --git a/paddle/fluid/distributed/service/env.h b/paddle/fluid/distributed/service/env.h index 901aba0ad90c49c7403862997830bed7e0950dc0..ca395a776afd4e2ee53e0aeaebb94494d4f4e6a6 100644 --- a/paddle/fluid/distributed/service/env.h +++ b/paddle/fluid/distributed/service/env.h @@ -39,7 +39,7 @@ struct PSHost { // |---ip---|---port---|--rank--| // |-32bit--|--20bit---|--12bit-| - // for pslib + uint64_t serialize_to_uint64() { uint64_t host_label = 0; host_label = inet_addr(ip.c_str()); @@ -175,14 +175,12 @@ class PSEnvironment { host.ip = ip; host.port = port; host.rank = rank; - if (sign_set.count(rank) > 0) { 
- LOG(WARNING) << "ps-host :" << host.ip << ":" << host.port - << ", rank:" << host.rank - << " already register, ignore register"; - } else { + + if (sign_set.count(rank) == 0) { host_list.push_back(host); sign_set.insert(rank); } + return 0; } diff --git a/paddle/fluid/distributed/service/graph_brpc_client.cc b/paddle/fluid/distributed/service/graph_brpc_client.cc new file mode 100644 index 0000000000000000000000000000000000000000..eafb4d596cc1671db26189b84ea9d0c0c31ea398 --- /dev/null +++ b/paddle/fluid/distributed/service/graph_brpc_client.cc @@ -0,0 +1,332 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/distributed/service/graph_brpc_client.h" +#include +#include +#include +#include +#include +#include +#include "Eigen/Dense" +#include "paddle/fluid/distributed/service/brpc_ps_client.h" +#include "paddle/fluid/distributed/table/table.h" +#include "paddle/fluid/framework/archive.h" +#include "paddle/fluid/string/string_helper.h" +namespace paddle { +namespace distributed { + +void GraphPsService_Stub::service( + ::google::protobuf::RpcController *controller, + const ::paddle::distributed::PsRequestMessage *request, + ::paddle::distributed::PsResponseMessage *response, + ::google::protobuf::Closure *done) { + if (graph_service != NULL && local_channel == channel()) { + // VLOG(0)<<"use local"; + task_pool->enqueue([this, controller, request, response, done]() -> int { + this->graph_service->service(controller, request, response, done); + return 0; + }); + } else { + // VLOG(0)<<"use server"; + PsService_Stub::service(controller, request, response, done); + } +} + +int GraphBrpcClient::get_server_index_by_id(uint64_t id) { + int shard_num = get_shard_num(); + int shard_per_server = shard_num % server_size == 0 + ? 
shard_num / server_size + : shard_num / server_size + 1; + return id % shard_num / shard_per_server; +} + +std::future GraphBrpcClient::get_node_feat( + const uint32_t &table_id, const std::vector &node_ids, + const std::vector &feature_names, + std::vector> &res) { + std::vector request2server; + std::vector server2request(server_size, -1); + for (int query_idx = 0; query_idx < node_ids.size(); ++query_idx) { + int server_index = get_server_index_by_id(node_ids[query_idx]); + if (server2request[server_index] == -1) { + server2request[server_index] = request2server.size(); + request2server.push_back(server_index); + } + } + size_t request_call_num = request2server.size(); + std::vector> node_id_buckets(request_call_num); + std::vector> query_idx_buckets(request_call_num); + for (int query_idx = 0; query_idx < node_ids.size(); ++query_idx) { + int server_index = get_server_index_by_id(node_ids[query_idx]); + int request_idx = server2request[server_index]; + node_id_buckets[request_idx].push_back(node_ids[query_idx]); + query_idx_buckets[request_idx].push_back(query_idx); + } + + DownpourBrpcClosure *closure = new DownpourBrpcClosure( + request_call_num, + [&, node_id_buckets, query_idx_buckets, request_call_num](void *done) { + int ret = 0; + auto *closure = (DownpourBrpcClosure *)done; + int fail_num = 0; + for (int request_idx = 0; request_idx < request_call_num; + ++request_idx) { + if (closure->check_response(request_idx, + PS_GRAPH_SAMPLE_NEIGHBOORS) != 0) { + ++fail_num; + } else { + auto &res_io_buffer = + closure->cntl(request_idx)->response_attachment(); + butil::IOBufBytesIterator io_buffer_itr(res_io_buffer); + size_t bytes_size = io_buffer_itr.bytes_left(); + std::unique_ptr buffer_wrapper(new char[bytes_size]); + char *buffer = buffer_wrapper.get(); + io_buffer_itr.copy_and_forward((void *)(buffer), bytes_size); + + for (size_t feat_idx = 0; feat_idx < feature_names.size(); + ++feat_idx) { + for (size_t node_idx = 0; + node_idx < query_idx_buckets.at(request_idx).size(); + ++node_idx) { + int query_idx = query_idx_buckets.at(request_idx).at(node_idx); + size_t feat_len = *(size_t *)(buffer); + buffer += sizeof(size_t); + auto feature = std::string(buffer, feat_len); + res[feat_idx][query_idx] = feature; + buffer += feat_len; + } + } + } + if (fail_num == request_call_num) { + ret = -1; + } + } + closure->set_promise_value(ret); + }); + + auto promise = std::make_shared>(); + closure->add_promise(promise); + std::future fut = promise->get_future(); + + for (int request_idx = 0; request_idx < request_call_num; ++request_idx) { + int server_index = request2server[request_idx]; + closure->request(request_idx)->set_cmd_id(PS_GRAPH_GET_NODE_FEAT); + closure->request(request_idx)->set_table_id(table_id); + closure->request(request_idx)->set_client_id(_client_id); + size_t node_num = node_id_buckets[request_idx].size(); + + closure->request(request_idx) + ->add_params((char *)node_id_buckets[request_idx].data(), + sizeof(uint64_t) * node_num); + std::string joint_feature_name = + paddle::string::join_strings(feature_names, '\t'); + closure->request(request_idx) + ->add_params(joint_feature_name.c_str(), joint_feature_name.size()); + + GraphPsService_Stub rpc_stub = + getServiceStub(get_cmd_channel(server_index)); + closure->cntl(request_idx)->set_log_id(butil::gettimeofday_ms()); + rpc_stub.service(closure->cntl(request_idx), closure->request(request_idx), + closure->response(request_idx), closure); + } + + return fut; +} +// char* &buffer,int &actual_size +std::future 
GraphBrpcClient::batch_sample_neighboors( + uint32_t table_id, std::vector node_ids, int sample_size, + std::vector>> &res) { + std::vector request2server; + std::vector server2request(server_size, -1); + res.clear(); + for (int query_idx = 0; query_idx < node_ids.size(); ++query_idx) { + int server_index = get_server_index_by_id(node_ids[query_idx]); + if (server2request[server_index] == -1) { + server2request[server_index] = request2server.size(); + request2server.push_back(server_index); + } + res.push_back(std::vector>()); + } + size_t request_call_num = request2server.size(); + std::vector> node_id_buckets(request_call_num); + std::vector> query_idx_buckets(request_call_num); + for (int query_idx = 0; query_idx < node_ids.size(); ++query_idx) { + int server_index = get_server_index_by_id(node_ids[query_idx]); + int request_idx = server2request[server_index]; + node_id_buckets[request_idx].push_back(node_ids[query_idx]); + query_idx_buckets[request_idx].push_back(query_idx); + } + + DownpourBrpcClosure *closure = new DownpourBrpcClosure( + request_call_num, + [&, node_id_buckets, query_idx_buckets, request_call_num](void *done) { + int ret = 0; + auto *closure = (DownpourBrpcClosure *)done; + int fail_num = 0; + for (int request_idx = 0; request_idx < request_call_num; + ++request_idx) { + if (closure->check_response(request_idx, + PS_GRAPH_SAMPLE_NEIGHBOORS) != 0) { + ++fail_num; + } else { + auto &res_io_buffer = + closure->cntl(request_idx)->response_attachment(); + butil::IOBufBytesIterator io_buffer_itr(res_io_buffer); + size_t bytes_size = io_buffer_itr.bytes_left(); + std::unique_ptr buffer_wrapper(new char[bytes_size]); + char *buffer = buffer_wrapper.get(); + io_buffer_itr.copy_and_forward((void *)(buffer), bytes_size); + + size_t node_num = *(size_t *)buffer; + int *actual_sizes = (int *)(buffer + sizeof(size_t)); + char *node_buffer = + buffer + sizeof(size_t) + sizeof(int) * node_num; + + int offset = 0; + for (size_t node_idx = 0; node_idx < node_num; ++node_idx) { + int query_idx = query_idx_buckets.at(request_idx).at(node_idx); + int actual_size = actual_sizes[node_idx]; + int start = 0; + while (start < actual_size) { + res[query_idx].push_back( + {*(uint64_t *)(node_buffer + offset + start), + *(float *)(node_buffer + offset + start + + GraphNode::id_size)}); + start += GraphNode::id_size + GraphNode::weight_size; + } + offset += actual_size; + } + } + if (fail_num == request_call_num) { + ret = -1; + } + } + closure->set_promise_value(ret); + }); + + auto promise = std::make_shared>(); + closure->add_promise(promise); + std::future fut = promise->get_future(); + + for (int request_idx = 0; request_idx < request_call_num; ++request_idx) { + int server_index = request2server[request_idx]; + closure->request(request_idx)->set_cmd_id(PS_GRAPH_SAMPLE_NEIGHBOORS); + closure->request(request_idx)->set_table_id(table_id); + closure->request(request_idx)->set_client_id(_client_id); + size_t node_num = node_id_buckets[request_idx].size(); + + closure->request(request_idx) + ->add_params((char *)node_id_buckets[request_idx].data(), + sizeof(uint64_t) * node_num); + closure->request(request_idx) + ->add_params((char *)&sample_size, sizeof(int)); + // PsService_Stub rpc_stub(get_cmd_channel(server_index)); + GraphPsService_Stub rpc_stub = + getServiceStub(get_cmd_channel(server_index)); + closure->cntl(request_idx)->set_log_id(butil::gettimeofday_ms()); + rpc_stub.service(closure->cntl(request_idx), closure->request(request_idx), + closure->response(request_idx), closure); + } + + 
return fut; +} +std::future GraphBrpcClient::random_sample_nodes( + uint32_t table_id, int server_index, int sample_size, + std::vector &ids) { + DownpourBrpcClosure *closure = new DownpourBrpcClosure(1, [&](void *done) { + int ret = 0; + auto *closure = (DownpourBrpcClosure *)done; + if (closure->check_response(0, PS_GRAPH_SAMPLE_NODES) != 0) { + ret = -1; + } else { + auto &res_io_buffer = closure->cntl(0)->response_attachment(); + butil::IOBufBytesIterator io_buffer_itr(res_io_buffer); + size_t bytes_size = io_buffer_itr.bytes_left(); + char buffer[bytes_size]; + auto size = io_buffer_itr.copy_and_forward((void *)(buffer), bytes_size); + int index = 0; + while (index < bytes_size) { + ids.push_back(*(uint64_t *)(buffer + index)); + index += GraphNode::id_size; + } + } + closure->set_promise_value(ret); + }); + auto promise = std::make_shared>(); + closure->add_promise(promise); + std::future fut = promise->get_future(); + ; + closure->request(0)->set_cmd_id(PS_GRAPH_SAMPLE_NODES); + closure->request(0)->set_table_id(table_id); + closure->request(0)->set_client_id(_client_id); + closure->request(0)->add_params((char *)&sample_size, sizeof(int)); + ; + // PsService_Stub rpc_stub(get_cmd_channel(server_index)); + GraphPsService_Stub rpc_stub = getServiceStub(get_cmd_channel(server_index)); + closure->cntl(0)->set_log_id(butil::gettimeofday_ms()); + rpc_stub.service(closure->cntl(0), closure->request(0), closure->response(0), + closure); + return fut; +} +std::future GraphBrpcClient::pull_graph_list( + uint32_t table_id, int server_index, int start, int size, int step, + std::vector &res) { + DownpourBrpcClosure *closure = new DownpourBrpcClosure(1, [&](void *done) { + int ret = 0; + auto *closure = (DownpourBrpcClosure *)done; + if (closure->check_response(0, PS_PULL_GRAPH_LIST) != 0) { + ret = -1; + } else { + auto &res_io_buffer = closure->cntl(0)->response_attachment(); + butil::IOBufBytesIterator io_buffer_itr(res_io_buffer); + size_t bytes_size = io_buffer_itr.bytes_left(); + char buffer[bytes_size]; + io_buffer_itr.copy_and_forward((void *)(buffer), bytes_size); + int index = 0; + while (index < bytes_size) { + FeatureNode node; + node.recover_from_buffer(buffer + index); + index += node.get_size(false); + res.push_back(node); + } + } + closure->set_promise_value(ret); + }); + auto promise = std::make_shared>(); + closure->add_promise(promise); + std::future fut = promise->get_future(); + closure->request(0)->set_cmd_id(PS_PULL_GRAPH_LIST); + closure->request(0)->set_table_id(table_id); + closure->request(0)->set_client_id(_client_id); + closure->request(0)->add_params((char *)&start, sizeof(int)); + closure->request(0)->add_params((char *)&size, sizeof(int)); + closure->request(0)->add_params((char *)&step, sizeof(int)); + // PsService_Stub rpc_stub(get_cmd_channel(server_index)); + GraphPsService_Stub rpc_stub = getServiceStub(get_cmd_channel(server_index)); + closure->cntl(0)->set_log_id(butil::gettimeofday_ms()); + rpc_stub.service(closure->cntl(0), closure->request(0), closure->response(0), + closure); + return fut; +} +int32_t GraphBrpcClient::initialize() { + // set_shard_num(_config.shard_num()); + BrpcPsClient::initialize(); + server_size = get_server_nums(); + graph_service = NULL; + local_channel = NULL; + return 0; +} +} +} diff --git a/paddle/fluid/distributed/service/graph_brpc_client.h b/paddle/fluid/distributed/service/graph_brpc_client.h new file mode 100644 index 0000000000000000000000000000000000000000..4e6775a4bedaf1a4028fe483f58be818ef1e3581 --- /dev/null +++ 
b/paddle/fluid/distributed/service/graph_brpc_client.h @@ -0,0 +1,105 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include + +#include +#include "ThreadPool.h" +#include "brpc/channel.h" +#include "brpc/controller.h" +#include "brpc/server.h" +#include "paddle/fluid/distributed/service/brpc_ps_client.h" +#include "paddle/fluid/distributed/service/graph_brpc_server.h" +#include "paddle/fluid/distributed/service/ps_client.h" +#include "paddle/fluid/distributed/table/table.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/tensor_util.h" + +namespace paddle { +namespace distributed { + +class GraphPsService_Stub : public PsService_Stub { + public: + GraphPsService_Stub(::google::protobuf::RpcChannel* channel, + ::google::protobuf::RpcChannel* local_channel = NULL, + GraphBrpcService* service = NULL, int thread_num = 1) + : PsService_Stub(channel) { + this->local_channel = local_channel; + this->graph_service = service; + task_pool.reset(new ::ThreadPool(thread_num)); + } + virtual ~GraphPsService_Stub() {} + + // implements PsService ------------------------------------------ + GraphBrpcService* graph_service; + std::shared_ptr<::ThreadPool> task_pool; + ::google::protobuf::RpcChannel* local_channel; + GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(GraphPsService_Stub); + void service(::google::protobuf::RpcController* controller, + const ::paddle::distributed::PsRequestMessage* request, + ::paddle::distributed::PsResponseMessage* response, + ::google::protobuf::Closure* done); +}; +class GraphBrpcClient : public BrpcPsClient { + public: + GraphBrpcClient() {} + virtual ~GraphBrpcClient() {} + // given a batch of nodes, sample graph_neighboors for each of them + virtual std::future batch_sample_neighboors( + uint32_t table_id, std::vector node_ids, int sample_size, + std::vector>>& res); + + virtual std::future pull_graph_list(uint32_t table_id, + int server_index, int start, + int size, int step, + std::vector& res); + virtual std::future random_sample_nodes(uint32_t table_id, + int server_index, + int sample_size, + std::vector& ids); + virtual std::future get_node_feat( + const uint32_t& table_id, const std::vector& node_ids, + const std::vector& feature_names, + std::vector>& res); + virtual int32_t initialize(); + int get_shard_num() { return shard_num; } + void set_shard_num(int shard_num) { this->shard_num = shard_num; } + int get_server_index_by_id(uint64_t id); + void set_local_channel(int index) { + this->local_channel = get_cmd_channel(index); + } + void set_local_graph_service(GraphBrpcService* graph_service) { + this->graph_service = graph_service; + } + GraphPsService_Stub getServiceStub(::google::protobuf::RpcChannel* channel, + int thread_num = 1) { + return GraphPsService_Stub(channel, local_channel, graph_service, + thread_num); + } + + private: + int shard_num; + size_t server_size; 
+ ::google::protobuf::RpcChannel* local_channel; + GraphBrpcService* graph_service; +}; + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/service/graph_brpc_server.cc b/paddle/fluid/distributed/service/graph_brpc_server.cc new file mode 100644 index 0000000000000000000000000000000000000000..bdd926278b624b9e9bfdf19a4f293784bef6e28f --- /dev/null +++ b/paddle/fluid/distributed/service/graph_brpc_server.cc @@ -0,0 +1,348 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/distributed/service/graph_brpc_server.h" +#include "paddle/fluid/distributed/service/brpc_ps_server.h" + +#include // NOLINT +#include "butil/endpoint.h" +#include "iomanip" +#include "paddle/fluid/distributed/service/brpc_ps_client.h" +#include "paddle/fluid/framework/archive.h" +#include "paddle/fluid/platform/profiler.h" +namespace paddle { +namespace distributed { + +int32_t GraphBrpcServer::initialize() { + auto &service_config = _config.downpour_server_param().service_param(); + if (!service_config.has_service_class()) { + LOG(ERROR) << "miss service_class in ServerServiceParameter"; + return -1; + } + auto *service = + CREATE_PSCORE_CLASS(PsBaseService, service_config.service_class()); + if (service == NULL) { + LOG(ERROR) << "service is unregistered, service_name:" + << service_config.service_class(); + return -1; + } + + _service.reset(service); + if (service->configure(this) != 0 || service->initialize() != 0) { + LOG(ERROR) << "service initialize failed, service_name:" + << service_config.service_class(); + return -1; + } + if (_server.AddService(service, brpc::SERVER_DOESNT_OWN_SERVICE) != 0) { + LOG(ERROR) << "service add to brpc failed, service:" + << service_config.service_class(); + return -1; + } + return 0; +} + +uint64_t GraphBrpcServer::start(const std::string &ip, uint32_t port) { + std::unique_lock lock(mutex_); + + std::string ip_port = ip + ":" + std::to_string(port); + VLOG(3) << "server of rank " << _rank << " starts at " << ip_port; + brpc::ServerOptions options; + + int num_threads = std::thread::hardware_concurrency(); + auto trainers = _environment->get_trainers(); + options.num_threads = trainers > num_threads ? 
trainers : num_threads;
+
+  if (_server.Start(ip_port.c_str(), &options) != 0) {
+    LOG(ERROR) << "GraphBrpcServer start failed, ip_port=" << ip_port;
+    return 0;
+  }
+  _environment->registe_ps_server(ip, port, _rank);
+  return 0;
+}
+
+int32_t GraphBrpcServer::port() { return _server.listen_address().port; }
+
+int32_t GraphBrpcService::initialize() {
+  _is_initialize_shard_info = false;
+  _service_handler_map[PS_STOP_SERVER] = &GraphBrpcService::stop_server;
+  _service_handler_map[PS_LOAD_ONE_TABLE] = &GraphBrpcService::load_one_table;
+  _service_handler_map[PS_LOAD_ALL_TABLE] = &GraphBrpcService::load_all_table;
+
+  _service_handler_map[PS_PRINT_TABLE_STAT] =
+      &GraphBrpcService::print_table_stat;
+  _service_handler_map[PS_BARRIER] = &GraphBrpcService::barrier;
+  _service_handler_map[PS_START_PROFILER] = &GraphBrpcService::start_profiler;
+  _service_handler_map[PS_STOP_PROFILER] = &GraphBrpcService::stop_profiler;
+
+  _service_handler_map[PS_PULL_GRAPH_LIST] = &GraphBrpcService::pull_graph_list;
+  _service_handler_map[PS_GRAPH_SAMPLE_NEIGHBOORS] =
+      &GraphBrpcService::graph_random_sample_neighboors;
+  _service_handler_map[PS_GRAPH_SAMPLE_NODES] =
+      &GraphBrpcService::graph_random_sample_nodes;
+  _service_handler_map[PS_GRAPH_GET_NODE_FEAT] =
+      &GraphBrpcService::graph_get_node_feat;
+
+  // shard initialization: the shard info of server_list can only be obtained
+  // from env after the server has started
+  initialize_shard_info();
+
+  return 0;
+}
+
+#define CHECK_TABLE_EXIST(table, request, response)        \
+  if (table == NULL) {                                     \
+    std::string err_msg("table not found with table_id:"); \
+    err_msg.append(std::to_string(request.table_id()));    \
+    set_response_code(response, -1, err_msg.c_str());      \
+    return -1;                                             \
+  }
+
+int32_t GraphBrpcService::initialize_shard_info() {
+  if (!_is_initialize_shard_info) {
+    std::lock_guard<std::mutex> guard(_initialize_shard_mutex);
+    if (_is_initialize_shard_info) {
+      return 0;
+    }
+    size_t shard_num = _server->environment()->get_ps_servers().size();
+    auto &table_map = *(_server->table());
+    for (auto itr : table_map) {
+      itr.second->set_shard(_rank, shard_num);
+    }
+    _is_initialize_shard_info = true;
+  }
+  return 0;
+}
+
+void GraphBrpcService::service(google::protobuf::RpcController *cntl_base,
+                               const PsRequestMessage *request,
+                               PsResponseMessage *response,
+                               google::protobuf::Closure *done) {
+  brpc::ClosureGuard done_guard(done);
+  std::string log_label("ReceiveCmd-");
+  if (!request->has_table_id()) {
+    set_response_code(*response, -1, "PsRequestMessage.table_id is required");
+    return;
+  }
+
+  response->set_err_code(0);
+  response->set_err_msg("");
+  auto *table = _server->table(request->table_id());
+  brpc::Controller *cntl = static_cast<brpc::Controller *>(cntl_base);
+  auto itr = _service_handler_map.find(request->cmd_id());
+  if (itr == _service_handler_map.end()) {
+    std::string err_msg(
+        "undefined cmd_id, should match PsCmdID in ps.proto, cmd_id:");
+    err_msg.append(std::to_string(request->cmd_id()));
+    set_response_code(*response, -1, err_msg.c_str());
+    return;
+  }
+  serviceFunc handler_func = itr->second;
+  int service_ret = (this->*handler_func)(table, *request, *response, cntl);
+  if (service_ret != 0) {
+    response->set_err_code(service_ret);
+    response->set_err_msg("server internal error");
+  }
+}
+
+int32_t GraphBrpcService::barrier(Table *table, const PsRequestMessage &request,
+                                  PsResponseMessage &response,
+                                  brpc::Controller *cntl) {
+  CHECK_TABLE_EXIST(table, request, response)
+
+  if (request.params_size() < 1) {
+    set_response_code(response, -1,
+                      "PsRequestMessage.params is required at "
+                      "least 1 for num of sparse_key");
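+    // the error is already recorded in the response; returning 0 below keeps
+    // this message instead of the generic "server internal error" that
+    // service() writes for nonzero handler returns.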
return 0; + } + + auto trainer_id = request.client_id(); + auto barrier_type = request.params(0); + table->barrier(trainer_id, barrier_type); + return 0; +} + +int32_t GraphBrpcService::print_table_stat(Table *table, + const PsRequestMessage &request, + PsResponseMessage &response, + brpc::Controller *cntl) { + CHECK_TABLE_EXIST(table, request, response) + std::pair ret = table->print_table_stat(); + paddle::framework::BinaryArchive ar; + ar << ret.first << ret.second; + std::string table_info(ar.Buffer(), ar.Length()); + response.set_data(table_info); + + return 0; +} + +int32_t GraphBrpcService::load_one_table(Table *table, + const PsRequestMessage &request, + PsResponseMessage &response, + brpc::Controller *cntl) { + CHECK_TABLE_EXIST(table, request, response) + if (request.params_size() < 2) { + set_response_code( + response, -1, + "PsRequestMessage.datas is requeired at least 2 for path & load_param"); + return -1; + } + if (table->load(request.params(0), request.params(1)) != 0) { + set_response_code(response, -1, "table load failed"); + return -1; + } + return 0; +} + +int32_t GraphBrpcService::load_all_table(Table *table, + const PsRequestMessage &request, + PsResponseMessage &response, + brpc::Controller *cntl) { + auto &table_map = *(_server->table()); + for (auto &itr : table_map) { + if (load_one_table(itr.second.get(), request, response, cntl) != 0) { + LOG(ERROR) << "load table[" << itr.first << "] failed"; + return -1; + } + } + return 0; +} + +int32_t GraphBrpcService::stop_server(Table *table, + const PsRequestMessage &request, + PsResponseMessage &response, + brpc::Controller *cntl) { + GraphBrpcServer *p_server = (GraphBrpcServer *)_server; + std::thread t_stop([p_server]() { + p_server->stop(); + LOG(INFO) << "Server Stoped"; + }); + p_server->export_cv()->notify_all(); + t_stop.detach(); + return 0; +} + +int32_t GraphBrpcService::stop_profiler(Table *table, + const PsRequestMessage &request, + PsResponseMessage &response, + brpc::Controller *cntl) { + platform::DisableProfiler(platform::EventSortingKey::kDefault, + string::Sprintf("server_%s_profile", _rank)); + return 0; +} + +int32_t GraphBrpcService::start_profiler(Table *table, + const PsRequestMessage &request, + PsResponseMessage &response, + brpc::Controller *cntl) { + platform::EnableProfiler(platform::ProfilerState::kCPU); + return 0; +} + +int32_t GraphBrpcService::pull_graph_list(Table *table, + const PsRequestMessage &request, + PsResponseMessage &response, + brpc::Controller *cntl) { + CHECK_TABLE_EXIST(table, request, response) + if (request.params_size() < 3) { + set_response_code(response, -1, + "pull_graph_list request requires at least 3 arguments"); + return 0; + } + int start = *(int *)(request.params(0).c_str()); + int size = *(int *)(request.params(1).c_str()); + int step = *(int *)(request.params(2).c_str()); + std::unique_ptr buffer; + int actual_size; + ((GraphTable *)table) + ->pull_graph_list(start, size, buffer, actual_size, false, step); + cntl->response_attachment().append(buffer.get(), actual_size); + return 0; +} +int32_t GraphBrpcService::graph_random_sample_neighboors( + Table *table, const PsRequestMessage &request, PsResponseMessage &response, + brpc::Controller *cntl) { + CHECK_TABLE_EXIST(table, request, response) + if (request.params_size() < 2) { + set_response_code( + response, -1, + "graph_random_sample request requires at least 2 arguments"); + return 0; + } + size_t node_num = request.params(0).size() / sizeof(uint64_t); + uint64_t *node_data = (uint64_t 
*)(request.params(0).c_str()); + int sample_size = *(uint64_t *)(request.params(1).c_str()); + std::vector> buffers(node_num); + std::vector actual_sizes(node_num, 0); + ((GraphTable *)table) + ->random_sample_neighboors(node_data, sample_size, buffers, actual_sizes); + + cntl->response_attachment().append(&node_num, sizeof(size_t)); + cntl->response_attachment().append(actual_sizes.data(), + sizeof(int) * node_num); + for (size_t idx = 0; idx < node_num; ++idx) { + cntl->response_attachment().append(buffers[idx].get(), actual_sizes[idx]); + } + return 0; +} +int32_t GraphBrpcService::graph_random_sample_nodes( + Table *table, const PsRequestMessage &request, PsResponseMessage &response, + brpc::Controller *cntl) { + size_t size = *(uint64_t *)(request.params(0).c_str()); + std::unique_ptr buffer; + int actual_size; + if (((GraphTable *)table)->random_sample_nodes(size, buffer, actual_size) == + 0) { + cntl->response_attachment().append(buffer.get(), actual_size); + } else + cntl->response_attachment().append(NULL, 0); + + return 0; +} + +int32_t GraphBrpcService::graph_get_node_feat(Table *table, + const PsRequestMessage &request, + PsResponseMessage &response, + brpc::Controller *cntl) { + CHECK_TABLE_EXIST(table, request, response) + if (request.params_size() < 2) { + set_response_code( + response, -1, + "graph_get_node_feat request requires at least 2 arguments"); + return 0; + } + size_t node_num = request.params(0).size() / sizeof(uint64_t); + uint64_t *node_data = (uint64_t *)(request.params(0).c_str()); + std::vector node_ids(node_data, node_data + node_num); + + std::vector feature_names = + paddle::string::split_string(request.params(1), "\t"); + + std::vector> feature( + feature_names.size(), std::vector(node_num)); + + ((GraphTable *)table)->get_node_feat(node_ids, feature_names, feature); + + for (size_t feat_idx = 0; feat_idx < feature_names.size(); ++feat_idx) { + for (size_t node_idx = 0; node_idx < node_num; ++node_idx) { + size_t feat_len = feature[feat_idx][node_idx].size(); + cntl->response_attachment().append(&feat_len, sizeof(size_t)); + cntl->response_attachment().append(feature[feat_idx][node_idx].data(), + feat_len); + } + } + + return 0; +} +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/service/graph_brpc_server.h b/paddle/fluid/distributed/service/graph_brpc_server.h new file mode 100644 index 0000000000000000000000000000000000000000..32c572f9e6c2bf759c59190679bcf7570a807f2d --- /dev/null +++ b/paddle/fluid/distributed/service/graph_brpc_server.h @@ -0,0 +1,114 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
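+
+// GraphBrpcServer exposes GraphTable shards over brpc. GraphBrpcService maps
+// the graph-specific PsCmdIDs (PS_PULL_GRAPH_LIST, PS_GRAPH_SAMPLE_NEIGHBOORS,
+// PS_GRAPH_SAMPLE_NODES, PS_GRAPH_GET_NODE_FEAT) onto GraphTable calls,
+// alongside the generic table / profiler / barrier handlers.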
+ +#pragma once + +#include "brpc/channel.h" +#include "brpc/controller.h" +#include "brpc/server.h" + +#include +#include +#include "paddle/fluid/distributed/service/brpc_ps_server.h" +#include "paddle/fluid/distributed/service/server.h" +#include "paddle/fluid/distributed/table/common_graph_table.h" +#include "paddle/fluid/distributed/table/table.h" +namespace paddle { +namespace distributed { +class GraphBrpcServer : public PSServer { + public: + GraphBrpcServer() {} + virtual ~GraphBrpcServer() {} + PsBaseService *get_service() { return _service.get(); } + virtual uint64_t start(const std::string &ip, uint32_t port); + virtual int32_t stop() { + std::unique_lock lock(mutex_); + if (stoped_) return 0; + stoped_ = true; + // cv_.notify_all(); + _server.Stop(1000); + _server.Join(); + return 0; + } + virtual int32_t port(); + + std::condition_variable *export_cv() { return &cv_; } + + private: + virtual int32_t initialize(); + mutable std::mutex mutex_; + std::condition_variable cv_; + bool stoped_ = false; + brpc::Server _server; + std::shared_ptr _service; + std::vector> _pserver_channels; +}; + +class GraphBrpcService; + +typedef int32_t (GraphBrpcService::*serviceFunc)( + Table *table, const PsRequestMessage &request, PsResponseMessage &response, + brpc::Controller *cntl); + +class GraphBrpcService : public PsBaseService { + public: + virtual int32_t initialize() override; + + virtual void service(::google::protobuf::RpcController *controller, + const PsRequestMessage *request, + PsResponseMessage *response, + ::google::protobuf::Closure *done) override; + + protected: + std::unordered_map _service_handler_map; + int32_t initialize_shard_info(); + int32_t pull_graph_list(Table *table, const PsRequestMessage &request, + PsResponseMessage &response, brpc::Controller *cntl); + int32_t graph_random_sample_neighboors(Table *table, + const PsRequestMessage &request, + PsResponseMessage &response, + brpc::Controller *cntl); + int32_t graph_random_sample_nodes(Table *table, + const PsRequestMessage &request, + PsResponseMessage &response, + brpc::Controller *cntl); + int32_t graph_get_node_feat(Table *table, const PsRequestMessage &request, + PsResponseMessage &response, + brpc::Controller *cntl); + int32_t barrier(Table *table, const PsRequestMessage &request, + PsResponseMessage &response, brpc::Controller *cntl); + int32_t load_one_table(Table *table, const PsRequestMessage &request, + PsResponseMessage &response, brpc::Controller *cntl); + int32_t load_all_table(Table *table, const PsRequestMessage &request, + PsResponseMessage &response, brpc::Controller *cntl); + int32_t stop_server(Table *table, const PsRequestMessage &request, + PsResponseMessage &response, brpc::Controller *cntl); + int32_t start_profiler(Table *table, const PsRequestMessage &request, + PsResponseMessage &response, brpc::Controller *cntl); + int32_t stop_profiler(Table *table, const PsRequestMessage &request, + PsResponseMessage &response, brpc::Controller *cntl); + + int32_t print_table_stat(Table *table, const PsRequestMessage &request, + PsResponseMessage &response, brpc::Controller *cntl); + + private: + bool _is_initialize_shard_info; + std::mutex _initialize_shard_mutex; + std::unordered_map _msg_handler_map; + std::vector _ori_values; + const int sample_nodes_ranges = 23; +}; + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/service/graph_py_service.cc b/paddle/fluid/distributed/service/graph_py_service.cc new file mode 100644 index 
0000000000000000000000000000000000000000..61e4e0cf7bb9155d25c630296c2b55a7d3400bfc --- /dev/null +++ b/paddle/fluid/distributed/service/graph_py_service.cc @@ -0,0 +1,325 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/distributed/service/graph_py_service.h" +#include // NOLINT +#include "butil/endpoint.h" +#include "iomanip" +#include "paddle/fluid/distributed/table/table.h" +#include "paddle/fluid/framework/archive.h" +#include "paddle/fluid/platform/profiler.h" +namespace paddle { +namespace distributed { +std::vector GraphPyService::split(std::string& str, + const char pattern) { + std::vector res; + std::stringstream input(str); + std::string temp; + while (std::getline(input, temp, pattern)) { + res.push_back(temp); + } + return res; +} + +void GraphPyService::add_table_feat_conf(std::string table_name, + std::string feat_name, + std::string feat_dtype, + int32_t feat_shape) { + if (this->table_id_map.count(table_name)) { + this->table_feat_conf_table_name.push_back(table_name); + this->table_feat_conf_feat_name.push_back(feat_name); + this->table_feat_conf_feat_dtype.push_back(feat_dtype); + this->table_feat_conf_feat_shape.push_back(feat_shape); + } +} + +void GraphPyService::set_up(std::string ips_str, int shard_num, + std::vector node_types, + std::vector edge_types) { + set_shard_num(shard_num); + set_num_node_types(node_types.size()); + + for (size_t table_id = 0; table_id < node_types.size(); table_id++) { + this->table_id_map[node_types[table_id]] = this->table_id_map.size(); + } + for (size_t table_id = 0; table_id < edge_types.size(); table_id++) { + this->table_id_map[edge_types[table_id]] = this->table_id_map.size(); + } + std::istringstream stream(ips_str); + std::string ip; + server_size = 0; + std::vector ips_list = split(ips_str, ';'); + int index = 0; + for (auto ips : ips_list) { + auto ip_and_port = split(ips, ':'); + server_list.push_back(ip_and_port[0]); + port_list.push_back(ip_and_port[1]); + uint32_t port = stoul(ip_and_port[1]); + auto ph_host = paddle::distributed::PSHost(ip_and_port[0], port, index); + host_sign_list.push_back(ph_host.serialize_to_string()); + index++; + } +} +void GraphPyClient::start_client() { + std::map> dense_regions; + dense_regions.insert( + std::pair>(0, {})); + auto regions = dense_regions[0]; + ::paddle::distributed::PSParameter worker_proto = GetWorkerProto(); + paddle::distributed::PaddlePSEnvironment _ps_env; + auto servers_ = host_sign_list.size(); + _ps_env = paddle::distributed::PaddlePSEnvironment(); + _ps_env.set_ps_servers(&host_sign_list, servers_); + worker_ptr = std::shared_ptr( + (paddle::distributed::GraphBrpcClient*) + paddle::distributed::PSClientFactory::create(worker_proto)); + worker_ptr->configure(worker_proto, dense_regions, _ps_env, client_id); + worker_ptr->set_shard_num(get_shard_num()); +} +void GraphPyServer::start_server(bool block) { + std::string ip = server_list[rank]; + uint32_t port = 
std::stoul(port_list[rank]); + ::paddle::distributed::PSParameter server_proto = this->GetServerProto(); + + auto _ps_env = paddle::distributed::PaddlePSEnvironment(); + _ps_env.set_ps_servers(&this->host_sign_list, + this->host_sign_list.size()); // test + pserver_ptr = std::shared_ptr( + (paddle::distributed::GraphBrpcServer*) + paddle::distributed::PSServerFactory::create(server_proto)); + VLOG(0) << "pserver-ptr created "; + std::vector empty_vec; + framework::ProgramDesc empty_prog; + empty_vec.push_back(empty_prog); + pserver_ptr->configure(server_proto, _ps_env, rank, empty_vec); + pserver_ptr->start(ip, port); + std::condition_variable* cv_ = pserver_ptr->export_cv(); + if (block) { + std::mutex mutex_; + std::unique_lock lock(mutex_); + cv_->wait(lock); + } +} +::paddle::distributed::PSParameter GraphPyServer::GetServerProto() { + // Generate server proto desc + ::paddle::distributed::PSParameter server_fleet_desc; + ::paddle::distributed::ServerParameter* server_proto = + server_fleet_desc.mutable_server_param(); + ::paddle::distributed::DownpourServerParameter* downpour_server_proto = + server_proto->mutable_downpour_server_param(); + ::paddle::distributed::ServerServiceParameter* server_service_proto = + downpour_server_proto->mutable_service_param(); + server_service_proto->set_service_class("GraphBrpcService"); + server_service_proto->set_server_class("GraphBrpcServer"); + server_service_proto->set_client_class("GraphBrpcClient"); + server_service_proto->set_start_server_port(0); + server_service_proto->set_server_thread_num(12); + + for (auto& tuple : this->table_id_map) { + VLOG(0) << " make a new table " << tuple.second; + ::paddle::distributed::TableParameter* sparse_table_proto = + downpour_server_proto->add_downpour_table_param(); + std::vector feat_name; + std::vector feat_dtype; + std::vector feat_shape; + for (size_t i = 0; i < this->table_feat_conf_table_name.size(); i++) { + if (tuple.first == table_feat_conf_table_name[i]) { + feat_name.push_back(table_feat_conf_feat_name[i]); + feat_dtype.push_back(table_feat_conf_feat_dtype[i]); + feat_shape.push_back(table_feat_conf_feat_shape[i]); + } + } + std::string table_type; + if (tuple.second < this->num_node_types) { + table_type = "node"; + } else { + table_type = "edge"; + } + + GetDownpourSparseTableProto(sparse_table_proto, tuple.second, tuple.first, + table_type, feat_name, feat_dtype, feat_shape); + } + + return server_fleet_desc; +} + +::paddle::distributed::PSParameter GraphPyClient::GetWorkerProto() { + ::paddle::distributed::PSParameter worker_fleet_desc; + ::paddle::distributed::WorkerParameter* worker_proto = + worker_fleet_desc.mutable_worker_param(); + + ::paddle::distributed::DownpourWorkerParameter* downpour_worker_proto = + worker_proto->mutable_downpour_worker_param(); + + for (auto& tuple : this->table_id_map) { + VLOG(0) << " make a new table " << tuple.second; + ::paddle::distributed::TableParameter* worker_sparse_table_proto = + downpour_worker_proto->add_downpour_table_param(); + std::vector feat_name; + std::vector feat_dtype; + std::vector feat_shape; + for (size_t i = 0; i < this->table_feat_conf_table_name.size(); i++) { + if (tuple.first == table_feat_conf_table_name[i]) { + feat_name.push_back(table_feat_conf_feat_name[i]); + feat_dtype.push_back(table_feat_conf_feat_dtype[i]); + feat_shape.push_back(table_feat_conf_feat_shape[i]); + } + } + std::string table_type; + if (tuple.second < this->num_node_types) { + table_type = "node"; + } else { + table_type = "edge"; + } + + 
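+      // table ids below num_node_types were assigned to node types in
+      // set_up(), the remaining ids to edge types, hence the switch above.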
GetDownpourSparseTableProto(worker_sparse_table_proto, tuple.second, + tuple.first, table_type, feat_name, feat_dtype, + feat_shape); + } + + ::paddle::distributed::ServerParameter* server_proto = + worker_fleet_desc.mutable_server_param(); + ::paddle::distributed::DownpourServerParameter* downpour_server_proto = + server_proto->mutable_downpour_server_param(); + ::paddle::distributed::ServerServiceParameter* server_service_proto = + downpour_server_proto->mutable_service_param(); + server_service_proto->set_service_class("GraphBrpcService"); + server_service_proto->set_server_class("GraphBrpcServer"); + server_service_proto->set_client_class("GraphBrpcClient"); + server_service_proto->set_start_server_port(0); + server_service_proto->set_server_thread_num(12); + + for (auto& tuple : this->table_id_map) { + VLOG(0) << " make a new table " << tuple.second; + ::paddle::distributed::TableParameter* sparse_table_proto = + downpour_server_proto->add_downpour_table_param(); + std::vector feat_name; + std::vector feat_dtype; + std::vector feat_shape; + for (size_t i = 0; i < this->table_feat_conf_table_name.size(); i++) { + if (tuple.first == table_feat_conf_table_name[i]) { + feat_name.push_back(table_feat_conf_feat_name[i]); + feat_dtype.push_back(table_feat_conf_feat_dtype[i]); + feat_shape.push_back(table_feat_conf_feat_shape[i]); + } + } + std::string table_type; + if (tuple.second < this->num_node_types) { + table_type = "node"; + } else { + table_type = "edge"; + } + + GetDownpourSparseTableProto(sparse_table_proto, tuple.second, tuple.first, + table_type, feat_name, feat_dtype, feat_shape); + } + + return worker_fleet_desc; +} +void GraphPyClient::load_edge_file(std::string name, std::string filepath, + bool reverse) { + // 'e' means load edge + std::string params = "e"; + if (reverse) { + // 'e<' means load edges from $2 to $1 + params += "<"; + } else { + // 'e>' means load edges from $1 to $2 + params += ">"; + } + if (this->table_id_map.count(name)) { + VLOG(0) << "loadding data with type " << name << " from " << filepath; + uint32_t table_id = this->table_id_map[name]; + auto status = + get_ps_client()->load(table_id, std::string(filepath), params); + status.wait(); + } +} + +void GraphPyClient::load_node_file(std::string name, std::string filepath) { + // 'n' means load nodes and 'node_type' follows + std::string params = "n" + name; + if (this->table_id_map.count(name)) { + uint32_t table_id = this->table_id_map[name]; + auto status = + get_ps_client()->load(table_id, std::string(filepath), params); + status.wait(); + } +} +std::vector>> +GraphPyClient::batch_sample_neighboors(std::string name, + std::vector node_ids, + int sample_size) { + std::vector>> v; + if (this->table_id_map.count(name)) { + uint32_t table_id = this->table_id_map[name]; + auto status = + worker_ptr->batch_sample_neighboors(table_id, node_ids, sample_size, v); + status.wait(); + } + return v; +} + +std::vector GraphPyClient::random_sample_nodes(std::string name, + int server_index, + int sample_size) { + std::vector v; + if (this->table_id_map.count(name)) { + uint32_t table_id = this->table_id_map[name]; + auto status = + worker_ptr->random_sample_nodes(table_id, server_index, sample_size, v); + status.wait(); + } + return v; +} + +// (name, dtype, ndarray) +std::vector> GraphPyClient::get_node_feat( + std::string node_type, std::vector node_ids, + std::vector feature_names) { + std::vector> v( + feature_names.size(), std::vector(node_ids.size())); + if (this->table_id_map.count(node_type)) { + uint32_t 
table_id = this->table_id_map[node_type]; + auto status = + worker_ptr->get_node_feat(table_id, node_ids, feature_names, v); + status.wait(); + } + return v; +} + +std::vector GraphPyClient::pull_graph_list(std::string name, + int server_index, + int start, int size, + int step) { + std::vector res; + if (this->table_id_map.count(name)) { + uint32_t table_id = this->table_id_map[name]; + auto status = worker_ptr->pull_graph_list(table_id, server_index, start, + size, step, res); + status.wait(); + } + return res; +} + +void GraphPyClient::stop_server() { + VLOG(0) << "going to stop server"; + std::unique_lock lock(mutex_); + if (stoped_) return; + auto status = this->worker_ptr->stop_server(); + if (status.get() == 0) stoped_ = true; +} +void GraphPyClient::finalize_worker() { this->worker_ptr->finalize_worker(); } +} +} diff --git a/paddle/fluid/distributed/service/graph_py_service.h b/paddle/fluid/distributed/service/graph_py_service.h new file mode 100644 index 0000000000000000000000000000000000000000..c6657be96ba446d2f7538943aab43dd47e1868fb --- /dev/null +++ b/paddle/fluid/distributed/service/graph_py_service.h @@ -0,0 +1,166 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
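+
+// Python-facing wrappers around the graph parameter server: GraphPyService
+// holds the shared table / feature configuration, GraphPyServer drives a
+// GraphBrpcServer and GraphPyClient drives a GraphBrpcClient.
+// Illustrative flow only (host/port, shard count and type names are made up):
+//   GraphPyServer server;
+//   server.set_up("127.0.0.1:8245", 127, {"user"}, {"u2u"}, /*rank=*/0);
+//   server.start_server(false);
+//   GraphPyClient client;
+//   client.set_up("127.0.0.1:8245", 127, {"user"}, {"u2u"}, /*client_id=*/0);
+//   client.start_client();
+//   client.load_edge_file("u2u", "u2u_edges.txt", false);
+//   auto samples = client.batch_sample_neighboors("u2u", {37}, 20);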
+ +#pragma once +#include +#include // NOLINT +#include +#include +#include +#include +#include +#include // NOLINT +#include +#include +#include "google/protobuf/text_format.h" + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/framework/variable.h" + +#include "paddle/fluid/distributed/ps.pb.h" +#include "paddle/fluid/distributed/service/env.h" +#include "paddle/fluid/distributed/service/graph_brpc_client.h" +#include "paddle/fluid/distributed/service/graph_brpc_server.h" +#include "paddle/fluid/distributed/service/sendrecv.pb.h" +#include "paddle/fluid/distributed/service/service.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/string/printf.h" +namespace paddle { +namespace distributed { +class GraphPyService { + protected: + std::vector server_list, port_list, host_sign_list; + int server_size, shard_num; + int num_node_types; + std::unordered_map table_id_map; + std::vector table_feat_conf_table_name; + std::vector table_feat_conf_feat_name; + std::vector table_feat_conf_feat_dtype; + std::vector table_feat_conf_feat_shape; + + public: + int get_shard_num() { return shard_num; } + void set_shard_num(int shard_num) { this->shard_num = shard_num; } + void GetDownpourSparseTableProto( + ::paddle::distributed::TableParameter* sparse_table_proto, + uint32_t table_id, std::string table_name, std::string table_type, + std::vector feat_name, std::vector feat_dtype, + std::vector feat_shape) { + sparse_table_proto->set_table_id(table_id); + sparse_table_proto->set_table_class("GraphTable"); + sparse_table_proto->set_shard_num(shard_num); + sparse_table_proto->set_type(::paddle::distributed::PS_SPARSE_TABLE); + ::paddle::distributed::TableAccessorParameter* accessor_proto = + sparse_table_proto->mutable_accessor(); + + ::paddle::distributed::CommonAccessorParameter* common_proto = + sparse_table_proto->mutable_common(); + + // Set GraphTable Parameter + common_proto->set_table_name(table_name); + common_proto->set_name(table_type); + for (size_t i = 0; i < feat_name.size(); i++) { + common_proto->add_params(feat_dtype[i]); + common_proto->add_dims(feat_shape[i]); + common_proto->add_attributes(feat_name[i]); + } + + accessor_proto->set_accessor_class("CommMergeAccessor"); + } + + void set_server_size(int server_size) { this->server_size = server_size; } + void set_num_node_types(int num_node_types) { + this->num_node_types = num_node_types; + } + int get_server_size(int server_size) { return server_size; } + std::vector split(std::string& str, const char pattern); + void set_up(std::string ips_str, int shard_num, + std::vector node_types, + std::vector edge_types); + + void add_table_feat_conf(std::string node_type, std::string feat_name, + std::string feat_dtype, int32_t feat_shape); +}; +class GraphPyServer : public GraphPyService { + public: + GraphPyServer() {} + void set_up(std::string ips_str, int shard_num, + std::vector node_types, + std::vector edge_types, int rank) { + set_rank(rank); + GraphPyService::set_up(ips_str, shard_num, node_types, edge_types); + } + int get_rank() { return rank; } + void set_rank(int rank) { this->rank = rank; } + + void start_server(bool block = true); + ::paddle::distributed::PSParameter GetServerProto(); + std::shared_ptr get_ps_server() { + return pserver_ptr; + } + + protected: + int rank; 
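+  // rank of this server inside host_sign_list; pserver_ptr owns the
+  // GraphBrpcServer created in start_server().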
+ std::shared_ptr pserver_ptr; + std::thread* server_thread; +}; +class GraphPyClient : public GraphPyService { + public: + void set_up(std::string ips_str, int shard_num, + std::vector node_types, + std::vector edge_types, int client_id) { + set_client_id(client_id); + GraphPyService::set_up(ips_str, shard_num, node_types, edge_types); + } + std::shared_ptr get_ps_client() { + return worker_ptr; + } + void bind_local_server(int local_channel_index, GraphPyServer& server) { + worker_ptr->set_local_channel(local_channel_index); + worker_ptr->set_local_graph_service( + (paddle::distributed::GraphBrpcService*)server.get_ps_server() + ->get_service()); + } + void stop_server(); + void finalize_worker(); + void load_edge_file(std::string name, std::string filepath, bool reverse); + void load_node_file(std::string name, std::string filepath); + int get_client_id() { return client_id; } + void set_client_id(int client_id) { this->client_id = client_id; } + void start_client(); + std::vector>> batch_sample_neighboors( + std::string name, std::vector node_ids, int sample_size); + std::vector random_sample_nodes(std::string name, int server_index, + int sample_size); + std::vector> get_node_feat( + std::string node_type, std::vector node_ids, + std::vector feature_names); + std::vector pull_graph_list(std::string name, int server_index, + int start, int size, int step = 1); + ::paddle::distributed::PSParameter GetWorkerProto(); + + protected: + mutable std::mutex mutex_; + int client_id; + std::shared_ptr worker_ptr; + std::thread* client_thread; + bool stoped_ = false; +}; +} +} diff --git a/paddle/fluid/distributed/service/ps_client.cc b/paddle/fluid/distributed/service/ps_client.cc index 095b5dee0b28e4f3319927aa2440e906489db7de..d45f41a0f58de36bb1575c1b51663f8899fb215d 100644 --- a/paddle/fluid/distributed/service/ps_client.cc +++ b/paddle/fluid/distributed/service/ps_client.cc @@ -15,11 +15,15 @@ #include "paddle/fluid/distributed/service/ps_client.h" #include "glog/logging.h" #include "paddle/fluid/distributed/service/brpc_ps_client.h" +#include "paddle/fluid/distributed/service/graph_brpc_client.h" +#include "paddle/fluid/distributed/service/ps_local_client.h" #include "paddle/fluid/distributed/table/table.h" namespace paddle { namespace distributed { REGISTER_PSCORE_CLASS(PSClient, BrpcPsClient); +REGISTER_PSCORE_CLASS(PSClient, PsLocalClient); +REGISTER_PSCORE_CLASS(PSClient, GraphBrpcClient); int32_t PSClient::configure( const PSParameter &config, @@ -78,8 +82,7 @@ PSClient *PSClientFactory::create(const PSParameter &ps_config) { } TableManager::instance().initialize(); - LOG(INFO) << "Create PSClient[" << service_param.client_class() - << "] success"; + VLOG(3) << "Create PSClient[" << service_param.client_class() << "] success"; return client; } } // namespace distributed diff --git a/paddle/fluid/distributed/service/ps_client.h b/paddle/fluid/distributed/service/ps_client.h index 50f5802c63a2538566988f75e3c098bd01785294..74a1e0dde71fc4a3dd7af17968502131556d0518 100644 --- a/paddle/fluid/distributed/service/ps_client.h +++ b/paddle/fluid/distributed/service/ps_client.h @@ -24,16 +24,11 @@ #include "paddle/fluid/distributed/service/env.h" #include "paddle/fluid/distributed/service/sendrecv.pb.h" #include "paddle/fluid/distributed/table/accessor.h" +#include "paddle/fluid/distributed/table/graph/graph_node.h" namespace paddle { namespace distributed { -class PSEnvironment; -class PsRequestMessage; -class PsResponseMessage; -class ValueAccessor; -struct Region; - using 
paddle::distributed::PsRequestMessage; using paddle::distributed::PsResponseMessage; @@ -117,10 +112,22 @@ class PSClient { // future结束前keys和values缓冲区不能再次使用 // 整合多个线程请求的keys,聚集并分散发送到server // 返回结果后,遍历buffer并对values赋值 + // is_training 用于区分请求是训练/预测,server端对于特征和准入会有不同的处理. virtual std::future pull_sparse(float **select_values, size_t table_id, - const uint64_t *keys, - size_t num) = 0; + const uint64_t *keys, size_t num, + bool is_training) = 0; + + virtual ::std::future pull_sparse_ptr(char **select_values, + size_t table_id, + const uint64_t *keys, + size_t num) { + VLOG(0) << "Did not implement"; + std::promise promise; + std::future fut = promise.get_future(); + promise.set_value(-1); + return fut; + } virtual std::future print_table_stat(uint32_t table_id) = 0; @@ -154,12 +161,13 @@ class PSClient { virtual std::future send_client2client_msg(int msg_type, int to_client_id, const std::string &msg) { - LOG(FATAL) << "Did not implement"; + VLOG(0) << "Did not implement"; std::promise promise; std::future fut = promise.get_future(); promise.set_value(-1); return fut; } + // client2client消息处理,std::function ret (msg_type, from_client_id, msg) typedef std::function MsgHandlerFunc; diff --git a/paddle/fluid/distributed/service/ps_local_client.cc b/paddle/fluid/distributed/service/ps_local_client.cc new file mode 100644 index 0000000000000000000000000000000000000000..2acc845a50890beb834676c3394f8dabc2a77e78 --- /dev/null +++ b/paddle/fluid/distributed/service/ps_local_client.cc @@ -0,0 +1,269 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
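+
+// PsLocalClient serves the PSClient interface from tables held in this
+// process (no brpc), so single-node training can reuse the same client API;
+// calls are either direct table operations or no-ops that return an already
+// completed future (see done()).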
+ +#include "paddle/fluid/distributed/service/ps_local_client.h" +#include "paddle/fluid/distributed/table/table.h" + +//#define pslib_debug_dense_compress + +namespace paddle { +namespace distributed { +int32_t PsLocalClient::initialize() { + const auto& downpour_param = _config.server_param().downpour_server_param(); + TableManager::instance().initialize(); + for (size_t i = 0; i < downpour_param.downpour_table_param_size(); ++i) { + auto* table = CREATE_PSCORE_CLASS( + Table, downpour_param.downpour_table_param(i).table_class()); + table->initialize(downpour_param.downpour_table_param(i), + _config.fs_client_param()); + table->set_shard(0, 1); + _table_map[downpour_param.downpour_table_param(i).table_id()].reset(table); + } + return 0; +} + +::std::future PsLocalClient::shrink(uint32_t table_id, + const std::string threshold) { + // TODO + return done(); +} + +::std::future PsLocalClient::load(const std::string& epoch, + const std::string& mode) { + // TODO + // for (auto& it : _table_map) { + // load(it.first, epoch, mode); + //} + return done(); +} +::std::future PsLocalClient::load(uint32_t table_id, + const std::string& epoch, + const std::string& mode) { + // TODO + // auto* table_ptr = table(table_id); + // table_ptr->load(epoch, mode); + return done(); +} + +::std::future PsLocalClient::save(const std::string& epoch, + const std::string& mode) { + // TODO + for (auto& it : _table_map) { + save(it.first, epoch, mode); + } + return done(); +} +::std::future PsLocalClient::save(uint32_t table_id, + const std::string& epoch, + const std::string& mode) { + // TODO + auto* table_ptr = table(table_id); + table_ptr->flush(); + table_ptr->save(epoch, mode); + return done(); +} + +::std::future PsLocalClient::clear() { + // TODO + return done(); +} +::std::future PsLocalClient::clear(uint32_t table_id) { + // TODO + return done(); +} + +::std::future PsLocalClient::flush() { + // no need + return done(); +} + +::std::future PsLocalClient::stop_server() { + // no need + return done(); +} + +::std::future PsLocalClient::pull_dense(Region* regions, + size_t region_num, + size_t table_id) { + auto* accessor = table_accessor(table_id); + auto* table_ptr = table(table_id); + + uint32_t num_per_shard = dense_dim_per_shard(accessor->fea_dim(), 1); + std::vector region_buffer; + region_buffer.resize(num_per_shard); + table_ptr->pull_dense(region_buffer.data(), region_buffer.size()); + + size_t region_idx = 0; + size_t region_data_idx = 0; + size_t shard_data_size = num_per_shard; + size_t shard_buffer_remain = shard_data_size * sizeof(float); + PADDLE_ENFORCE_EQ( + shard_buffer_remain, region_buffer.size() * sizeof(float), + platform::errors::PreconditionNotMet("pull dense size error.")); + size_t index = 0; + while (shard_buffer_remain > 0 && region_idx < region_num) { + auto& region = regions[region_idx]; + if (region.size - region_data_idx >= shard_buffer_remain) { + memcpy((void*)(region.data + region_data_idx), + (uint8_t*)(void*)(region_buffer.data()) + index, + shard_buffer_remain); + region_data_idx += shard_buffer_remain; + shard_buffer_remain = 0; + } else if (region.size - region_data_idx == 0) { + ++region_idx; + region_data_idx = 0; + } else { + memcpy((void*)(region.data + region_data_idx), + (uint8_t*)(void*)(region_buffer.data()) + index, + region.size - region_data_idx); + shard_buffer_remain -= (region.size - region_data_idx); + index += (region.size - region_data_idx); + ++region_idx; + region_data_idx = 0; + } + } + + return done(); +} + +::std::future 
PsLocalClient::push_dense_param(const Region* regions, + size_t region_num, + size_t table_id) { + auto* accessor = table_accessor(table_id); + auto* table_ptr = table(table_id); + + std::vector region_buffer; + region_buffer.resize(dense_dim_per_shard(accessor->fea_dim(), 1), 0); + for (size_t i = 0, offset = 0; i < region_num; ++i) { + uint32_t data_num = regions[i].size / sizeof(float); + memcpy(region_buffer.data() + offset, regions[i].data, regions[i].size); + offset += data_num; + } + + // table_ptr->push_dense_param(region_buffer.data(), region_buffer.size()); + + return done(); +} + +::std::future PsLocalClient::push_dense_raw_gradient( + int table_id, float* total_send_data, size_t total_send_data_size, + void* callback) { + VLOG(1) << "wxx push_dense_raw_gradient"; + + PSClientClosure* closure = reinterpret_cast(callback); + + auto* table_ptr = table(table_id); + + table_ptr->push_dense(total_send_data, total_send_data_size); + delete closure; + return done(); +} + +::std::future PsLocalClient::push_dense(const Region* regions, + size_t region_num, + size_t table_id) { + auto* accessor = table_accessor(table_id); + auto* table_ptr = table(table_id); + + std::vector region_buffer; + region_buffer.resize(dense_dim_per_shard(accessor->fea_dim(), 1)); + size_t data_size = region_buffer.size(); + for (size_t i = 0, offset = 0; i < region_num; ++i) { + uint32_t data_num = regions[i].size / sizeof(float); + PADDLE_ENFORCE_LE( + offset + data_num, data_size, + platform::errors::PreconditionNotMet( + "invalid dense size, cur pos[%d] data_num[%d] size[%d]", offset, + data_num, data_size)); + memcpy(region_buffer.data() + offset, regions[i].data, regions[i].size); + offset += data_num; + } + + table_ptr->push_dense(region_buffer.data(), region_buffer.size()); + + return done(); +} + +//::std::future PsLocalClient::pull_sparse(float** select_values, +// size_t table_id, +// const uint64_t* keys, +// size_t num) { +// // FIXME +// // auto timer = +// // std::make_shared("pslib_downpour_client_pull_sparse"); +// // auto local_timer = +// // std::make_shared("pslib_downpour_client_pull_sparse_local"); +// //将key拆分到各shard请求,并记录原始对应value指针 +// auto* accessor = table_accessor(table_id); +// auto* table_ptr = table(table_id); +// size_t value_size = accessor->select_size(); +// +// // table_ptr->pull_sparse(keys, num); +// std::vector res_data; +// res_data.resize(num * value_size / sizeof(float)); +// table_ptr->pull_sparse(res_data.data(), keys, num); +// // memcpy(select_values[0], res_data->data(), res_data->size() * +// // sizeof(float)); +// size_t offset = 0; +// for (int i = 0; i < num; ++i) { +// memcpy(select_values[i], (char*)res_data.data() + offset, value_size); +// offset += value_size; +// } +// +// // return fut; +// return done(); +//} + +::std::future PsLocalClient::pull_sparse_ptr(char** select_values, + size_t table_id, + const uint64_t* keys, + size_t num) { + // FIXME + // auto timer = + // std::make_shared("pslib_downpour_client_pull_sparse"); + // auto local_timer = + // std::make_shared("pslib_downpour_client_pull_sparse_local"); + //将key拆分到各shard请求,并记录原始对应value指针 + auto* table_ptr = table(table_id); + + table_ptr->pull_sparse_ptr(select_values, keys, num); + + return done(); +} + +::std::future PsLocalClient::push_sparse_raw_gradient( + size_t table_id, const uint64_t* keys, const float** update_values, + size_t num, void* callback) { + VLOG(1) << "wxx push_sparse_raw_gradient"; + PSClientClosure* closure = reinterpret_cast(callback); + auto* accessor = 
table_accessor(table_id); + auto* table_ptr = table(table_id); + + table_ptr->push_sparse(keys, update_values, num); + delete closure; + return done(); +} + +::std::future PsLocalClient::push_sparse(size_t table_id, + const uint64_t* keys, + const float** update_values, + size_t num) { + auto* accessor = table_accessor(table_id); + auto* table_ptr = table(table_id); + + table_ptr->push_sparse(keys, update_values, num); + return done(); +} +} +} diff --git a/paddle/fluid/distributed/service/ps_local_client.h b/paddle/fluid/distributed/service/ps_local_client.h new file mode 100644 index 0000000000000000000000000000000000000000..9d2b01a45fe929097c06fb264f470974410e7f4e --- /dev/null +++ b/paddle/fluid/distributed/service/ps_local_client.h @@ -0,0 +1,226 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License 0// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "brpc/channel.h" +#include "brpc/controller.h" +#include "brpc/server.h" +#include "paddle/fluid/distributed/service/ps_client.h" + +namespace paddle { +namespace distributed { + +class Table; + +class PsLocalClient : public PSClient { + public: + PsLocalClient() {} + virtual ~PsLocalClient() { _running = false; } + virtual int32_t create_client2client_connection(int pslib_timeout_ms, + int pslib_connect_timeout_ms, + int max_retry) { + return 0; + } + + virtual ::std::future shrink(uint32_t table_id, + const std::string threshold) override; + virtual ::std::future load(const std::string& epoch, + const std::string& mode) override; + virtual ::std::future load(uint32_t table_id, + const std::string& epoch, + const std::string& mode) override; + + virtual ::std::future save(const std::string& epoch, + const std::string& mode) override; + virtual ::std::future save(uint32_t table_id, + const std::string& epoch, + const std::string& mode) override; + + virtual ::std::future clear() override; + virtual ::std::future clear(uint32_t table_id) override; + + virtual ::std::future stop_server() override; + + virtual void finalize_worker() override {} + virtual ::std::future pull_dense(Region* regions, size_t region_num, + size_t table_id); + + virtual ::std::future push_dense(const Region* regions, + size_t region_num, size_t table_id); + + virtual ::std::future push_dense_param(const Region* regions, + size_t region_num, + size_t table_id); + + virtual ::std::future pull_sparse(float** select_values, + size_t table_id, + const uint64_t* keys, size_t num, + bool is_training) { + std::promise prom; + std::future fut = prom.get_future(); + prom.set_value(0); + + return fut; + } + + virtual ::std::future pull_sparse_ptr(char** select_values, + size_t table_id, + const uint64_t* keys, + size_t num); + + virtual ::std::future print_table_stat(uint32_t table_id) { + std::promise prom; + std::future fut = prom.get_future(); + prom.set_value(0); + + return fut; + } + virtual ::std::future push_sparse(size_t table_id, + const uint64_t* keys, + const float** update_values, + size_t num); + + virtual ::std::future 
flush(); + // server profilera + virtual std::future start_profiler() { + std::promise prom; + std::future fut = prom.get_future(); + prom.set_value(0); + + return fut; + }; + + virtual std::future stop_profiler() { + std::promise prom; + std::future fut = prom.get_future(); + prom.set_value(0); + + return fut; + } + + virtual std::future barrier(size_t table_id, uint32_t barrier_type) { + std::promise prom; + std::future fut = prom.get_future(); + prom.set_value(0); + + return fut; + } + + virtual std::future pull_geo_param(size_t table_id, + std::vector* values, + std::vector* keys, + int pserver_idx) { + std::promise prom; + std::future fut = prom.get_future(); + prom.set_value(0); + + return fut; + } + + virtual std::future push_global_step(int table_id, + int64_t* total_send_data, + void* done) { + std::promise prom; + std::future fut = prom.get_future(); + prom.set_value(0); + + return fut; + } + + // recv table from server and save it in LodTensor + virtual int32_t recv_and_save_table(const uint64_t table_id, + const std::string& path) { + return 0; + } + + virtual ::std::future send_client2client_msg( + int msg_type, int to_client_id, const std::string& msg) override { + std::promise prom; + std::future fut = prom.get_future(); + prom.set_value(0); + + return fut; + } + virtual size_t get_server_nums() { return 1; } + + virtual std::future push_dense_raw_gradient( + int table_id, float* total_send_data, size_t total_send_data_size, + void* callback) override; + + virtual std::future push_sparse_raw_gradient( + size_t table_id, const uint64_t* keys, const float** update_values, + size_t num, void* callback) override; + + virtual std::future push_sparse_raw_gradient_partial( + size_t table_id, const uint64_t* keys, const float** update_values, + uint32_t num, void* done, int pserver_idx) override { + std::promise prom; + std::future fut = prom.get_future(); + prom.set_value(0); + + return fut; + } + + virtual std::future push_sparse_param(size_t table_id, + const uint64_t* keys, + const float** update_values, + size_t num, + void* done) override { + std::promise prom; + std::future fut = prom.get_future(); + prom.set_value(0); + + return fut; + } + + private: + virtual int32_t initialize() override; + + std::future done() { + std::shared_ptr> prom = + std::make_shared>(); + std::future fut = prom->get_future(); + prom->set_value(0); + return fut; + } + + inline uint32_t dense_dim_per_shard(uint32_t dense_dim_total, + uint32_t shard_num) { + return dense_dim_total / shard_num + 1; + } + + inline std::unordered_map>* table() { + return &_table_map; + } + + inline Table* table(size_t table_id) { + auto itr = _table_map.find(table_id); + if (itr != _table_map.end()) { + return itr->second.get(); + } + LOG(ERROR) << "table not found " << table_id; + return NULL; + } + + std::unordered_map> _table_map; + + bool _running = false; + bool _flushing = false; + + private: + float _mae = 0; + float _mse = 0; + uint16_t _push_times = 0; +}; +} +} diff --git a/paddle/fluid/operators/distributed/parameter_send.h b/paddle/fluid/distributed/service/ps_local_server.h similarity index 56% rename from paddle/fluid/operators/distributed/parameter_send.h rename to paddle/fluid/distributed/service/ps_local_server.h index 4335ef8c73cc0a3f4d019cbfe9be078a88914217..dfbccc70900e3cf10fbb0852a114e400d738e2d6 100644 --- a/paddle/fluid/operators/distributed/parameter_send.h +++ b/paddle/fluid/distributed/service/ps_local_server.h @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -14,22 +14,24 @@ #pragma once -#include +#include #include - -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/distributed/communicator_common.h" +#include "paddle/fluid/distributed/service/server.h" namespace paddle { -namespace operators { namespace distributed { -template -struct ParameterSend { - void operator()(const CommContext &rpc_ctx, const framework::Scope &scope, - bool sync, int multi_parts); -}; +class PsLocalServer : public PSServer { + public: + PsLocalServer() {} + virtual ~PsLocalServer() {} + virtual uint64_t start() { return 0; } + virtual uint64_t start(const std::string& ip, uint32_t port) { return 0; } + virtual int32_t stop() { return 0; } + virtual int32_t port() { return 0; } -}; // namespace distributed -}; // namespace operators -}; // namespace paddle + private: + virtual int32_t initialize() { return 0; } +}; +} +} diff --git a/paddle/fluid/distributed/service/sendrecv.proto b/paddle/fluid/distributed/service/sendrecv.proto index 6250f84c98754d31b6f0a4cf6689e4a560549f2c..d908c26da9870a93d81c0242ac03e26cfebdb976 100644 --- a/paddle/fluid/distributed/service/sendrecv.proto +++ b/paddle/fluid/distributed/service/sendrecv.proto @@ -48,6 +48,10 @@ enum PsCmdID { PS_START_PROFILER = 27; PS_STOP_PROFILER = 28; PS_PUSH_GLOBAL_STEP = 29; + PS_PULL_GRAPH_LIST = 30; + PS_GRAPH_SAMPLE_NEIGHBOORS = 31; + PS_GRAPH_SAMPLE_NODES = 32; + PS_GRAPH_GET_NODE_FEAT = 33; } message PsRequestMessage { @@ -111,4 +115,4 @@ message MultiVariableMessage { service PsService { rpc service(PsRequestMessage) returns (PsResponseMessage); rpc SendAndRecvVariable(MultiVariableMessage) returns (MultiVariableMessage); -}; \ No newline at end of file +}; diff --git a/paddle/fluid/distributed/service/server.cc b/paddle/fluid/distributed/service/server.cc index fc230a0b9c92e646f3dc87231effb7462f2340b6..e44876e3d2b789580152626ea8c290db0d369509 100644 --- a/paddle/fluid/distributed/service/server.cc +++ b/paddle/fluid/distributed/service/server.cc @@ -16,13 +16,18 @@ #include "glog/logging.h" #include "paddle/fluid/distributed/service/brpc_ps_server.h" +#include "paddle/fluid/distributed/service/graph_brpc_server.h" +#include "paddle/fluid/distributed/service/ps_local_server.h" #include "paddle/fluid/distributed/table/table.h" namespace paddle { namespace distributed { REGISTER_PSCORE_CLASS(PSServer, BrpcPsServer); +REGISTER_PSCORE_CLASS(PSServer, PsLocalServer); REGISTER_PSCORE_CLASS(PsBaseService, BrpcPsService); +REGISTER_PSCORE_CLASS(PSServer, GraphBrpcServer); +REGISTER_PSCORE_CLASS(PsBaseService, GraphBrpcService); PSServer *PSServerFactory::create(const PSParameter &ps_config) { const auto &config = ps_config.server_param(); diff --git a/paddle/fluid/distributed/service/service.cc b/paddle/fluid/distributed/service/service.cc index 3d0f94fac277509be7b17b648767dc835629b2b3..2759e4614e66e1d69c6427e0320ae44292757ffd 100644 --- a/paddle/fluid/distributed/service/service.cc +++ b/paddle/fluid/distributed/service/service.cc @@ -47,7 +47,7 @@ paddle::distributed::PSParameter load_from_prototxt( } void PSCore::init_gflag(const std::string& gflags) { - LOG(INFO) << "Init With Gflags:" << gflags; + VLOG(3) << "Init With Gflags:" << gflags; std::vector flags = paddle::string::split_string(gflags); if (flags.size() < 1) { flags.push_back("-max_body_size=314217728"); diff 
--git a/paddle/fluid/distributed/table/CMakeLists.txt b/paddle/fluid/distributed/table/CMakeLists.txt index 1e98e193d54ae6dc6e8a2d9981071283354e7f98..dab390958034af284baaffcb909d8b941fc3b9d1 100644 --- a/paddle/fluid/distributed/table/CMakeLists.txt +++ b/paddle/fluid/distributed/table/CMakeLists.txt @@ -1,13 +1,23 @@ set_property(GLOBAL PROPERTY TABLE_DEPS string_helper) - +set(graphDir graph) get_property(TABLE_DEPS GLOBAL PROPERTY TABLE_DEPS) - +set_source_files_properties(${graphDir}/graph_edge.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +cc_library(graph_edge SRCS ${graphDir}/graph_edge.cc) +set_source_files_properties(${graphDir}/graph_weighted_sampler.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +cc_library(WeightedSampler SRCS ${graphDir}/graph_weighted_sampler.cc DEPS graph_edge) +set_source_files_properties(${graphDir}/graph_node.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +cc_library(graph_node SRCS ${graphDir}/graph_node.cc DEPS WeightedSampler) set_source_files_properties(common_dense_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(common_sparse_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(sparse_geo_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(barrier_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +set_source_files_properties(common_graph_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + +get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS) -cc_library(common_table SRCS common_sparse_table.cc common_dense_table.cc sparse_geo_table.cc barrier_table.cc DEPS ${TABLE_DEPS} device_context string_helper simple_threadpool xxhash generator) +cc_library(common_table SRCS common_sparse_table.cc common_dense_table.cc +sparse_geo_table.cc barrier_table.cc common_graph_table.cc DEPS ${TABLE_DEPS} +${RPC_DEPS} graph_edge graph_node device_context string_helper simple_threadpool xxhash generator) set_source_files_properties(tensor_accessor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(tensor_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) diff --git a/paddle/fluid/distributed/table/common_graph_table.cc b/paddle/fluid/distributed/table/common_graph_table.cc new file mode 100644 index 0000000000000000000000000000000000000000..0dc99de1bfe82a691fdacb834acd1ad606dcb04b --- /dev/null +++ b/paddle/fluid/distributed/table/common_graph_table.cc @@ -0,0 +1,506 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
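+
+// GraphTable keeps nodes in GraphShard buckets, partitioned by id % shard_num;
+// each server only owns the shards in [shard_start, shard_end). load_nodes /
+// load_edges parse tab-separated files and build a per-node neighbor sampler
+// ("random", or "weighted" when an edge-weight column is present).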
+ +#include "paddle/fluid/distributed/table/common_graph_table.h" +#include +#include +#include +#include +#include "paddle/fluid/distributed/common/utils.h" +#include "paddle/fluid/distributed/table/graph/graph_node.h" +#include "paddle/fluid/string/printf.h" +#include "paddle/fluid/string/string_helper.h" +namespace paddle { +namespace distributed { + +std::vector GraphShard::get_batch(int start, int end, int step) { + if (start < 0) start = 0; + std::vector res; + for (int pos = start; pos < std::min(end, (int)bucket.size()); pos += step) { + res.push_back(bucket[pos]); + } + return res; +} + +size_t GraphShard::get_size() { return bucket.size(); } + +GraphNode *GraphShard::add_graph_node(uint64_t id) { + if (node_location.find(id) == node_location.end()) { + node_location[id] = bucket.size(); + bucket.push_back(new GraphNode(id)); + } + return (GraphNode *)bucket[node_location[id]]; +} + +FeatureNode *GraphShard::add_feature_node(uint64_t id) { + if (node_location.find(id) == node_location.end()) { + node_location[id] = bucket.size(); + bucket.push_back(new FeatureNode(id)); + } + return (FeatureNode *)bucket[node_location[id]]; +} + +void GraphShard::add_neighboor(uint64_t id, uint64_t dst_id, float weight) { + find_node(id)->add_edge(dst_id, weight); +} + +Node *GraphShard::find_node(uint64_t id) { + auto iter = node_location.find(id); + return iter == node_location.end() ? nullptr : bucket[iter->second]; +} + +int32_t GraphTable::load(const std::string &path, const std::string ¶m) { + bool load_edge = (param[0] == 'e'); + bool load_node = (param[0] == 'n'); + if (load_edge) { + bool reverse_edge = (param[1] == '<'); + return this->load_edges(path, reverse_edge); + } + if (load_node) { + std::string node_type = param.substr(1); + return this->load_nodes(path, node_type); + } + return 0; +} + +int32_t GraphTable::get_nodes_ids_by_ranges( + std::vector> ranges, std::vector &res) { + int start = 0, end, index = 0, total_size = 0; + res.clear(); + std::vector>> tasks; + // std::string temp = ""; + // for(int i = 0;i < shards.size();i++) + // temp+= std::to_string((int)shards[i].get_size()) + " "; + // VLOG(0)<<"range distribution "<= end) { + break; + } else { + int first = std::max(ranges[index].first, start); + int second = std::min(ranges[index].second, end); + start = second; + first -= total_size; + second -= total_size; + // VLOG(0)<<" FIND RANGE "<enqueue( + [this, first, second, i]() -> std::vector { + return shards[i].get_ids_by_range(first, second); + })); + } + } + total_size += shards[i].get_size(); + } + for (int i = 0; i < tasks.size(); i++) { + auto vec = tasks[i].get(); + for (auto &id : vec) { + res.push_back(id); + std::swap(res[rand() % res.size()], res[(int)res.size() - 1]); + } + } + return 0; +} + +int32_t GraphTable::load_nodes(const std::string &path, std::string node_type) { + auto paths = paddle::string::split_string(path, ";"); + int64_t count = 0; + int64_t valid_count = 0; + for (auto path : paths) { + std::ifstream file(path); + std::string line; + while (std::getline(file, line)) { + count++; + auto values = paddle::string::split_string(line, "\t"); + if (values.size() < 2) continue; + auto id = std::stoull(values[1]); + + size_t shard_id = id % shard_num; + if (shard_id >= shard_end || shard_id < shard_start) { + VLOG(4) << "will not load " << id << " from " << path + << ", please check id distribution"; + continue; + } + + if (count % 1000000 == 0) { + VLOG(0) << count << " nodes are loaded from filepath"; + } + + std::string nt = values[0]; + if (nt != 
node_type) { + continue; + } + + size_t index = shard_id - shard_start; + + auto node = shards[index].add_feature_node(id); + + node->set_feature_size(feat_name.size()); + + for (size_t slice = 2; slice < values.size(); slice++) { + auto feat = this->parse_feature(values[slice]); + if (feat.first >= 0) { + node->set_feature(feat.first, feat.second); + } else { + VLOG(4) << "Node feature: " << values[slice] + << " not in feature_map."; + } + } + valid_count++; + } + } + + VLOG(0) << valid_count << "/" << count << " nodes in type " << node_type + << " are loaded successfully in " << path; + return 0; +} + +int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge) { + auto paths = paddle::string::split_string(path, ";"); + int64_t count = 0; + std::string sample_type = "random"; + bool is_weighted = false; + int valid_count = 0; + + for (auto path : paths) { + std::ifstream file(path); + std::string line; + while (std::getline(file, line)) { + auto values = paddle::string::split_string(line, "\t"); + count++; + if (values.size() < 2) continue; + auto src_id = std::stoull(values[0]); + auto dst_id = std::stoull(values[1]); + if (reverse_edge) { + std::swap(src_id, dst_id); + } + float weight = 1; + if (values.size() == 3) { + weight = std::stof(values[2]); + sample_type = "weighted"; + is_weighted = true; + } + + size_t src_shard_id = src_id % shard_num; + + if (src_shard_id >= shard_end || src_shard_id < shard_start) { + VLOG(4) << "will not load " << src_id << " from " << path + << ", please check id distribution"; + continue; + } + if (count % 1000000 == 0) { + VLOG(0) << count << " edges are loaded from filepath"; + } + + size_t index = src_shard_id - shard_start; + shards[index].add_graph_node(src_id)->build_edges(is_weighted); + shards[index].add_neighboor(src_id, dst_id, weight); + valid_count++; + } + } + VLOG(0) << valid_count << "/" << count << " edges are loaded successfully in " + << path; + + // Build Sampler j + + for (auto &shard : shards) { + auto bucket = shard.get_bucket(); + for (int i = 0; i < bucket.size(); i++) { + bucket[i]->build_sampler(sample_type); + } + } + return 0; +} + +Node *GraphTable::find_node(uint64_t id) { + size_t shard_id = id % shard_num; + if (shard_id >= shard_end || shard_id < shard_start) { + return nullptr; + } + size_t index = shard_id - shard_start; + Node *node = shards[index].find_node(id); + return node; +} +uint32_t GraphTable::get_thread_pool_index(uint64_t node_id) { + return node_id % shard_num % shard_num_per_table % task_pool_size_; +} +int32_t GraphTable::random_sample_nodes(int sample_size, + std::unique_ptr &buffer, + int &actual_size) { + bool need_feature = false; + int total_size = 0; + for (int i = 0; i < shards.size(); i++) { + total_size += shards[i].get_size(); + } + if (sample_size > total_size) sample_size = total_size; + int range_num = random_sample_nodes_ranges; + if (range_num > sample_size) range_num = sample_size; + if (sample_size == 0 || range_num == 0) return 0; + std::vector ranges_len, ranges_pos; + int remain = sample_size, last_pos = -1, num; + std::set separator_set; + for (int i = 0; i < range_num - 1; i++) { + while (separator_set.find(num = rand() % (sample_size - 1)) != + separator_set.end()) + ; + separator_set.insert(num); + } + for (auto p : separator_set) { + ranges_len.push_back(p - last_pos); + last_pos = p; + } + ranges_len.push_back(sample_size - 1 - last_pos); + remain = total_size - sample_size + range_num; + separator_set.clear(); + for (int i = 0; i < range_num; i++) { + while 
(separator_set.find(num = rand() % remain) != separator_set.end()) + ; + separator_set.insert(num); + } + int used = 0, index = 0; + last_pos = -1; + for (auto p : separator_set) { + used += p - last_pos - 1; + last_pos = p; + ranges_pos.push_back(used); + used += ranges_len[index++]; + } + std::vector> first_half, second_half; + int start_index = rand() % total_size; + for (int i = 0; i < ranges_len.size() && i < ranges_pos.size(); i++) { + if (ranges_pos[i] + ranges_len[i] - 1 + start_index < total_size) + first_half.push_back({ranges_pos[i] + start_index, + ranges_pos[i] + ranges_len[i] + start_index}); + else if (ranges_pos[i] + start_index >= total_size) { + second_half.push_back( + {ranges_pos[i] + start_index - total_size, + ranges_pos[i] + ranges_len[i] + start_index - total_size}); + } else { + first_half.push_back({ranges_pos[i] + start_index, total_size}); + second_half.push_back( + {0, ranges_pos[i] + ranges_len[i] + start_index - total_size}); + } + } + for (auto &pair : first_half) second_half.push_back(pair); + std::vector res; + get_nodes_ids_by_ranges(second_half, res); + actual_size = res.size() * sizeof(uint64_t); + buffer.reset(new char[actual_size]); + char *pointer = buffer.get(); + memcpy(pointer, res.data(), actual_size); + return 0; +} +int32_t GraphTable::random_sample_neighboors( + uint64_t *node_ids, int sample_size, + std::vector> &buffers, + std::vector &actual_sizes) { + size_t node_num = buffers.size(); + std::vector> tasks; + for (size_t idx = 0; idx < node_num; ++idx) { + uint64_t &node_id = node_ids[idx]; + std::unique_ptr &buffer = buffers[idx]; + int &actual_size = actual_sizes[idx]; + tasks.push_back(_shards_task_pool[get_thread_pool_index(node_id)]->enqueue( + [&]() -> int { + Node *node = find_node(node_id); + + if (node == nullptr) { + actual_size = 0; + return 0; + } + std::vector res = node->sample_k(sample_size); + actual_size = res.size() * (Node::id_size + Node::weight_size); + int offset = 0; + uint64_t id; + float weight; + char *buffer_addr = new char[actual_size]; + buffer.reset(buffer_addr); + for (int &x : res) { + id = node->get_neighbor_id(x); + weight = node->get_neighbor_weight(x); + memcpy(buffer_addr + offset, &id, Node::id_size); + offset += Node::id_size; + memcpy(buffer_addr + offset, &weight, Node::weight_size); + offset += Node::weight_size; + } + return 0; + })); + } + for (size_t idx = 0; idx < node_num; ++idx) { + tasks[idx].get(); + } + return 0; +} + +int32_t GraphTable::get_node_feat(const std::vector &node_ids, + const std::vector &feature_names, + std::vector> &res) { + size_t node_num = node_ids.size(); + std::vector> tasks; + for (size_t idx = 0; idx < node_num; ++idx) { + uint64_t node_id = node_ids[idx]; + tasks.push_back(_shards_task_pool[get_thread_pool_index(node_id)]->enqueue( + [&, idx, node_id]() -> int { + Node *node = find_node(node_id); + + if (node == nullptr) { + return 0; + } + for (int feat_idx = 0; feat_idx < feature_names.size(); ++feat_idx) { + const std::string &feature_name = feature_names[feat_idx]; + if (feat_id_map.find(feature_name) != feat_id_map.end()) { + // res[feat_idx][idx] = + // node->get_feature(feat_id_map[feature_name]); + auto feat = node->get_feature(feat_id_map[feature_name]); + res[feat_idx][idx] = feat; + } + } + return 0; + })); + } + for (size_t idx = 0; idx < node_num; ++idx) { + tasks[idx].get(); + } + return 0; +} + +std::pair GraphTable::parse_feature( + std::string feat_str) { + // Return (feat_id, btyes) if name are in this->feat_name, else return (-1, + // "") + auto 
fields = paddle::string::split_string(feat_str, " "); + if (this->feat_id_map.count(fields[0])) { + int32_t id = this->feat_id_map[fields[0]]; + std::string dtype = this->feat_dtype[id]; + int32_t shape = this->feat_shape[id]; + std::vector values(fields.begin() + 1, fields.end()); + if (dtype == "feasign") { + return std::make_pair( + int32_t(id), paddle::string::join_strings(values, ' ')); + } else if (dtype == "string") { + return std::make_pair( + int32_t(id), paddle::string::join_strings(values, ' ')); + } else if (dtype == "float32") { + return std::make_pair( + int32_t(id), FeatureNode::parse_value_to_bytes(values)); + } else if (dtype == "float64") { + return std::make_pair( + int32_t(id), FeatureNode::parse_value_to_bytes(values)); + } else if (dtype == "int32") { + return std::make_pair( + int32_t(id), FeatureNode::parse_value_to_bytes(values)); + } else if (dtype == "int64") { + return std::make_pair( + int32_t(id), FeatureNode::parse_value_to_bytes(values)); + } + } + return std::make_pair(-1, ""); +} + +int32_t GraphTable::pull_graph_list(int start, int total_size, + std::unique_ptr &buffer, + int &actual_size, bool need_feature, + int step) { + if (start < 0) start = 0; + int size = 0, cur_size; + std::vector>> tasks; + for (size_t i = 0; i < shards.size() && total_size > 0; i++) { + cur_size = shards[i].get_size(); + if (size + cur_size <= start) { + size += cur_size; + continue; + } + int count = std::min(1 + (size + cur_size - start - 1) / step, total_size); + int end = start + (count - 1) * step + 1; + tasks.push_back(_shards_task_pool[i % task_pool_size_]->enqueue( + [this, i, start, end, step, size]() -> std::vector { + + return this->shards[i].get_batch(start - size, end - size, step); + })); + start += count * step; + total_size -= count; + size += cur_size; + } + for (size_t i = 0; i < tasks.size(); ++i) { + tasks[i].wait(); + } + size = 0; + std::vector> res; + for (size_t i = 0; i < tasks.size(); i++) { + res.push_back(tasks[i].get()); + for (size_t j = 0; j < res.back().size(); j++) { + size += res.back()[j]->get_size(need_feature); + } + } + char *buffer_addr = new char[size]; + buffer.reset(buffer_addr); + int index = 0; + for (size_t i = 0; i < res.size(); i++) { + for (size_t j = 0; j < res[i].size(); j++) { + res[i][j]->to_buffer(buffer_addr + index, need_feature); + index += res[i][j]->get_size(need_feature); + } + } + actual_size = size; + return 0; +} +int32_t GraphTable::initialize() { + _shards_task_pool.resize(task_pool_size_); + for (size_t i = 0; i < _shards_task_pool.size(); ++i) { + _shards_task_pool[i].reset(new ::ThreadPool(1)); + } + server_num = _shard_num; + // VLOG(0) << "in init graph table server num = " << server_num; + /* + _shard_num is actually server number here + when a server initialize its tables, it sets tables' _shard_num to server_num, + and _shard_idx to server + rank + */ + auto common = _config.common(); + + this->table_name = common.table_name(); + this->table_type = common.name(); + VLOG(0) << " init graph table type " << this->table_type << " table name " + << this->table_name; + int feat_conf_size = static_cast(common.attributes().size()); + for (int i = 0; i < feat_conf_size; i++) { + auto &f_name = common.attributes()[i]; + auto &f_shape = common.dims()[i]; + auto &f_dtype = common.params()[i]; + this->feat_name.push_back(f_name); + this->feat_shape.push_back(f_shape); + this->feat_dtype.push_back(f_dtype); + this->feat_id_map[f_name] = i; + VLOG(0) << "init graph table feat conf name:" << f_name + << " shape:" << f_shape 
<< " dtype:" << f_dtype; + } + + shard_num = _config.shard_num(); + VLOG(0) << "in init graph table shard num = " << shard_num << " shard_idx" + << _shard_idx; + shard_num_per_table = sparse_local_shard_num(shard_num, server_num); + shard_start = _shard_idx * shard_num_per_table; + shard_end = shard_start + shard_num_per_table; + VLOG(0) << "in init graph table shard idx = " << _shard_idx << " shard_start " + << shard_start << " shard_end " << shard_end; + // shards.resize(shard_num_per_table); + shards = std::vector(shard_num_per_table, GraphShard(shard_num)); + return 0; +} +} +}; diff --git a/paddle/fluid/distributed/table/common_graph_table.h b/paddle/fluid/distributed/table/common_graph_table.h new file mode 100644 index 0000000000000000000000000000000000000000..b18da82abe61c9695712f542e187ac48fd5edc9d --- /dev/null +++ b/paddle/fluid/distributed/table/common_graph_table.h @@ -0,0 +1,131 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include // NOLINT +#include +#include +#include +#include +#include "paddle/fluid/distributed/table/accessor.h" +#include "paddle/fluid/distributed/table/common_table.h" +#include "paddle/fluid/distributed/table/graph/graph_node.h" +#include "paddle/fluid/framework/rw_lock.h" +#include "paddle/fluid/string/string_helper.h" +namespace paddle { +namespace distributed { +class GraphShard { + public: + size_t get_size(); + GraphShard() {} + GraphShard(int shard_num) { this->shard_num = shard_num; } + std::vector &get_bucket() { return bucket; } + std::vector get_batch(int start, int end, int step); + std::vector get_ids_by_range(int start, int end) { + std::vector res; + for (int i = start; i < end && i < bucket.size(); i++) { + res.push_back(bucket[i]->get_id()); + } + return res; + } + GraphNode *add_graph_node(uint64_t id); + FeatureNode *add_feature_node(uint64_t id); + Node *find_node(uint64_t id); + void add_neighboor(uint64_t id, uint64_t dst_id, float weight); + std::unordered_map get_node_location() { + return node_location; + } + + private: + std::unordered_map node_location; + int shard_num; + std::vector bucket; +}; +class GraphTable : public SparseTable { + public: + GraphTable() {} + virtual ~GraphTable() {} + virtual int32_t pull_graph_list(int start, int size, + std::unique_ptr &buffer, + int &actual_size, bool need_feature, + int step); + + virtual int32_t random_sample_neighboors( + uint64_t *node_ids, int sample_size, + std::vector> &buffers, + std::vector &actual_sizes); + + int32_t random_sample_nodes(int sample_size, std::unique_ptr &buffers, + int &actual_sizes); + + virtual int32_t get_nodes_ids_by_ranges( + std::vector> ranges, std::vector &res); + virtual int32_t initialize(); + + int32_t load(const std::string &path, const std::string ¶m); + + int32_t load_edges(const std::string &path, bool reverse); + + int32_t load_nodes(const std::string &path, std::string node_type); + + Node 
*find_node(uint64_t id); + + virtual int32_t pull_sparse(float *values, + const PullSparseValue &pull_value) { + return 0; + } + + virtual int32_t push_sparse(const uint64_t *keys, const float *values, + size_t num) { + return 0; + } + + virtual void clear() {} + virtual int32_t flush() { return 0; } + virtual int32_t shrink(const std::string ¶m) { return 0; } + //指定保存路径 + virtual int32_t save(const std::string &path, const std::string &converter) { + return 0; + } + virtual int32_t initialize_shard() { return 0; } + virtual uint32_t get_thread_pool_index(uint64_t node_id); + virtual std::pair parse_feature(std::string feat_str); + + virtual int32_t get_node_feat(const std::vector &node_ids, + const std::vector &feature_names, + std::vector> &res); + + protected: + std::vector shards; + size_t shard_start, shard_end, server_num, shard_num_per_table, shard_num; + const int task_pool_size_ = 24; + const int random_sample_nodes_ranges = 3; + + std::vector feat_name; + std::vector feat_dtype; + std::vector feat_shape; + std::unordered_map feat_id_map; + std::string table_name; + std::string table_type; + + std::vector> _shards_task_pool; +}; +} // namespace distributed +}; // namespace paddle diff --git a/paddle/fluid/distributed/table/common_sparse_table.cc b/paddle/fluid/distributed/table/common_sparse_table.cc index ffedbea14a0290730b9d785464a84e3c4536a9e7..718fce9950719fb99e9831bad9490610ec3834cf 100644 --- a/paddle/fluid/distributed/table/common_sparse_table.cc +++ b/paddle/fluid/distributed/table/common_sparse_table.cc @@ -125,34 +125,37 @@ void ProcessALine(const std::vector& columns, const Meta& meta, int64_t SaveToText(std::ostream* os, std::shared_ptr block, const int mode) { - int64_t not_save_num = 0; - for (auto value : block->values_) { - if (mode == SaveMode::delta && !value.second->need_save_) { - not_save_num++; - continue; - } - - auto* vs = value.second->data_.data(); - std::stringstream ss; - auto id = value.first; - ss << id << "\t" << value.second->count_ << "\t" - << value.second->unseen_days_ << "\t" << value.second->is_entry_ << "\t"; - - for (int i = 0; i < block->value_length_; i++) { - ss << vs[i]; - ss << ","; - } + int64_t save_num = 0; + for (auto& table : block->values_) { + for (auto& value : table) { + if (mode == SaveMode::delta && !value.second->need_save_) { + continue; + } + save_num += 1; + + auto* vs = value.second->data_.data(); + std::stringstream ss; + auto id = value.first; + ss << id << "\t" << value.second->count_ << "\t" + << value.second->unseen_days_ << "\t" << value.second->is_entry_ + << "\t"; + + for (int i = 0; i < block->value_length_; i++) { + ss << vs[i]; + ss << ","; + } - ss << "\n"; + ss << "\n"; - os->write(ss.str().c_str(), sizeof(char) * ss.str().size()); + os->write(ss.str().c_str(), sizeof(char) * ss.str().size()); - if (mode == SaveMode::base || mode == SaveMode::delta) { - value.second->need_save_ = false; + if (mode == SaveMode::base || mode == SaveMode::delta) { + value.second->need_save_ = false; + } } } - return block->values_.size() - not_save_num; + return save_num; } int64_t LoadFromText(const std::string& valuepath, const std::string& metapath, @@ -183,7 +186,7 @@ int64_t LoadFromText(const std::string& valuepath, const std::string& metapath, block->Init(id, false); - auto value_instant = block->GetValue(id); + VALUE* value_instant = block->GetValue(id); if (values.size() == 5) { value_instant->count_ = std::stoi(values[1]); value_instant->unseen_days_ = std::stoi(values[2]); @@ -254,7 +257,6 @@ int32_t 
CommonSparseTable::initialize_value() { } auto accessor = _config.accessor(); - std::vector feasigns; for (size_t x = 0; x < accessor.fea_dim(); ++x) { @@ -271,9 +273,14 @@ int32_t CommonSparseTable::initialize_value() { std::vector ids(bucket_feasigns); std::copy(feasigns.begin() + buckets[x], feasigns.begin() + buckets[x + 1], ids.begin()); + + std::vector fres; + fres.resize(ids.size(), 1); + + auto pull_value = PullSparseValue(ids, fres, param_dim_); std::vector pulls; pulls.resize(bucket_feasigns * param_dim_); - pull_sparse(pulls.data(), ids.data(), bucket_feasigns); + pull_sparse(pulls.data(), pull_value); } return 0; @@ -369,8 +376,10 @@ std::pair CommonSparseTable::print_table_stat() { int64_t feasign_size = 0; int64_t mf_size = 0; - for (auto& value : shard_values_) { - feasign_size += value->values_.size(); + for (auto& shard : shard_values_) { + for (auto& table : shard->values_) { + feasign_size += table.size(); + } } return {feasign_size, mf_size}; @@ -399,10 +408,51 @@ int32_t CommonSparseTable::pour() { return 0; } -int32_t CommonSparseTable::pull_sparse(float* pull_values, const uint64_t* keys, - size_t num) { +int32_t CommonSparseTable::pull_sparse(float* pull_values, + const PullSparseValue& pull_value) { rwlock_->RDLock(); + auto shard_num = task_pool_size_; + std::vector> tasks(shard_num); + + for (int shard_id = 0; shard_id < shard_num; ++shard_id) { + tasks[shard_id] = _shards_task_pool[shard_id]->enqueue( + [this, shard_id, shard_num, &pull_value, &pull_values]() -> int { + auto& block = shard_values_[shard_id]; + + std::vector offsets; + pull_value.Fission(shard_id, shard_num, &offsets); + + if (pull_value.is_training_) { + for (auto& offset : offsets) { + auto feasign = pull_value.feasigns_[offset]; + auto frequencie = pull_value.frequencies_[offset]; + auto* value = block->Init(feasign, true, frequencie); + std::copy_n(value + param_offset_, param_dim_, + pull_values + param_dim_ * offset); + } + } else { + for (auto& offset : offsets) { + auto feasign = pull_value.feasigns_[offset]; + auto* value = block->Init(feasign, false); + std::copy_n(value + param_offset_, param_dim_, + pull_values + param_dim_ * offset); + } + } + + return 0; + }); + } + + for (size_t shard_id = 0; shard_id < tasks.size(); ++shard_id) { + tasks[shard_id].wait(); + } + rwlock_->UNLock(); + return 0; +} + +int32_t CommonSparseTable::pull_sparse_ptr(char** pull_values, + const uint64_t* keys, size_t num) { std::vector> offset_bucket; offset_bucket.resize(task_pool_size_); @@ -422,9 +472,10 @@ int32_t CommonSparseTable::pull_sparse(float* pull_values, const uint64_t* keys, for (int i = 0; i < offsets.size(); ++i) { auto offset = offsets[i]; auto id = keys[offset]; - auto* value = block->Init(id); - std::copy_n(value + param_offset_, param_dim_, - pull_values + param_dim_ * offset); + auto* value = block->InitGet(id); + // std::copy_n(value + param_offset_, param_dim_, + // pull_values + param_dim_ * offset); + pull_values[offset] = (char*)value; } return 0; @@ -434,7 +485,6 @@ int32_t CommonSparseTable::pull_sparse(float* pull_values, const uint64_t* keys, for (size_t shard_id = 0; shard_id < tasks.size(); ++shard_id) { tasks[shard_id].wait(); } - rwlock_->UNLock(); return 0; } @@ -494,6 +544,45 @@ int32_t CommonSparseTable::push_sparse(const uint64_t* keys, return 0; } +int32_t CommonSparseTable::push_sparse(const uint64_t* keys, + const float** values, size_t num) { + _push_sparse(keys, values, num); + return 0; +} + +int32_t CommonSparseTable::_push_sparse(const uint64_t* keys, + const 
float** values, size_t num) { + rwlock_->RDLock(); + std::vector> offset_bucket; + offset_bucket.resize(task_pool_size_); + + for (int x = 0; x < num; ++x) { + auto y = keys[x] % task_pool_size_; + offset_bucket[y].push_back(x); + } + + std::vector> tasks(task_pool_size_); + + for (int shard_id = 0; shard_id < task_pool_size_; ++shard_id) { + tasks[shard_id] = _shards_task_pool[shard_id]->enqueue( + [this, shard_id, &keys, &values, num, &offset_bucket]() -> int { + auto& offsets = offset_bucket[shard_id]; + for (size_t i = 0; i < offsets.size(); ++i) { + std::vector tmp_off = {0}; + optimizer_->update(keys + offsets[i], values[offsets[i]], num, + tmp_off, shard_values_[shard_id].get()); + } + return 0; + }); + } + + for (size_t shard_id = 0; shard_id < tasks.size(); ++shard_id) { + tasks[shard_id].wait(); + } + rwlock_->UNLock(); + return 0; +} + int32_t CommonSparseTable::push_sparse_param(const uint64_t* keys, const float* values, size_t num) { rwlock_->RDLock(); diff --git a/paddle/fluid/distributed/table/common_sparse_table.h b/paddle/fluid/distributed/table/common_sparse_table.h index 98cbf2b4a21057f64a5d510158907df1de393925..50c295da53464c8cc1589b27a6dbc233367991b4 100644 --- a/paddle/fluid/distributed/table/common_sparse_table.h +++ b/paddle/fluid/distributed/table/common_sparse_table.h @@ -61,12 +61,17 @@ class CommonSparseTable : public SparseTable { int32_t save(const std::string& path, const std::string& param); virtual std::pair print_table_stat(); - virtual int32_t pull_sparse(float* pull_values, const uint64_t* keys, - size_t num); + virtual int32_t pull_sparse(float* values, const PullSparseValue& pull_value); + + virtual int32_t pull_sparse_ptr(char** pull_values, const uint64_t* keys, + size_t num); virtual int32_t push_sparse(const uint64_t* keys, const float* values, size_t num); + virtual int32_t push_sparse(const uint64_t* keys, const float** values, + size_t num); + // only for sparse geo table virtual int32_t push_sparse_param(const uint64_t* keys, const float* values, size_t num); @@ -81,6 +86,8 @@ class CommonSparseTable : public SparseTable { protected: virtual int32_t _push_sparse(const uint64_t* keys, const float* values, size_t num); + virtual int32_t _push_sparse(const uint64_t* keys, const float** values, + size_t num); private: const int task_pool_size_ = 11; diff --git a/paddle/fluid/distributed/table/common_table.h b/paddle/fluid/distributed/table/common_table.h index dc3cfa75ff689863773e88ef2d077b80c1f0a5d5..bc7f17f5f245794cebf96a8a4bc69e0dce8ac997 100644 --- a/paddle/fluid/distributed/table/common_table.h +++ b/paddle/fluid/distributed/table/common_table.h @@ -98,8 +98,8 @@ class DenseTable : public Table { virtual ~DenseTable() {} virtual void *get_shard(size_t shard_idx) { return 0; } - int32_t pull_sparse(float *values, const uint64_t *keys, - size_t num) override { + int32_t pull_sparse(float *values, + const PullSparseValue &pull_value) override { return 0; } int32_t push_sparse(const uint64_t *keys, const float *values, @@ -123,8 +123,8 @@ class BarrierTable : public Table { int32_t push_dense(const float *values, size_t num) override { return 0; } - int32_t pull_sparse(float *values, const uint64_t *keys, - size_t num) override { + int32_t pull_sparse(float *values, + const PullSparseValue &pull_value) override { return 0; } int32_t push_sparse(const uint64_t *keys, const float *values, diff --git a/paddle/fluid/distributed/table/depends/dense.h b/paddle/fluid/distributed/table/depends/dense.h index 
a2acdfd20148ac282f6633e55ea450dd3367e5f2..8079003d1bf8f677aeaba91f8860504adcb853e0 100644 --- a/paddle/fluid/distributed/table/depends/dense.h +++ b/paddle/fluid/distributed/table/depends/dense.h @@ -89,7 +89,6 @@ class DSGD : public DenseOptimizer { auto blas = GetBlas(); float lr = *(global_learning_rate_) * (*learning_rate); - VLOG(4) << "DSGD LearningRate: " << lr; blas.VCOPY(update_numel, update_values + begin, grads.data()); blas.SCAL(update_numel, lr, grads.data()); blas.VSUB(update_numel, param + begin, grads.data(), param + begin); @@ -157,7 +156,6 @@ class DAdam : public DenseOptimizer { beta2_pow[0] = beta2_pow[0] * beta2; float lr_ = *(global_learning_rate_)*learning_rate[0]; - VLOG(4) << "DAdam LearningRate: " << lr_; lr_ *= sqrt(1 - beta2_pow[0]) / (1 - beta1_pow[0]); float* tmp_ = tmp.data(); diff --git a/paddle/fluid/distributed/table/depends/large_scale_kv.h b/paddle/fluid/distributed/table/depends/large_scale_kv.h index ba79a381a6d881fdc153ad0e04e0ee436120b179..5c10fca98cda4d6cbdcb430ab5f2b8016a6ff7f2 100644 --- a/paddle/fluid/distributed/table/depends/large_scale_kv.h +++ b/paddle/fluid/distributed/table/depends/large_scale_kv.h @@ -26,8 +26,10 @@ #include #include "gflags/gflags.h" +#include "butil/object_pool.h" #include "paddle/fluid/distributed/common/utils.h" #include "paddle/fluid/distributed/table/depends/initializers.h" +#include "paddle/fluid/distributed/thirdparty/round_robin.h" #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/rw_lock.h" @@ -47,6 +49,10 @@ namespace distributed { enum Mode { training, infer }; +static const int SPARSE_SHARD_BUCKET_NUM_BITS = 6; +static const size_t SPARSE_SHARD_BUCKET_NUM = (size_t)1 + << SPARSE_SHARD_BUCKET_NUM_BITS; + struct VALUE { explicit VALUE(size_t length) : length_(length), @@ -66,11 +72,11 @@ struct VALUE { bool is_entry_; // whether knock-in }; -inline bool count_entry(std::shared_ptr value, int threshold) { +inline bool count_entry(VALUE *value, int threshold) { return value->count_ >= threshold; } -inline bool probility_entry(std::shared_ptr value, float threshold) { +inline bool probility_entry(VALUE *value, float threshold) { UniformInitializer uniform = UniformInitializer({"uniform", "0", "0", "1"}); return uniform.GetValue() >= threshold; } @@ -87,7 +93,7 @@ class ValueBlock { value_dims_(value_dims), value_offsets_(value_offsets), value_idx_(value_idx) { - for (int x = 0; x < value_dims.size(); ++x) { + for (size_t x = 0; x < value_dims.size(); ++x) { value_length_ += value_dims[x]; } @@ -96,13 +102,15 @@ class ValueBlock { auto slices = string::split_string(entry_attr, ":"); if (slices[0] == "none") { entry_func_ = std::bind(&count_entry, std::placeholders::_1, 0); + threshold_ = 0; } else if (slices[0] == "count_filter_entry") { - int threshold = std::stoi(slices[1]); - entry_func_ = std::bind(&count_entry, std::placeholders::_1, threshold); + threshold_ = std::stoi(slices[1]); + entry_func_ = + std::bind(&count_entry, std::placeholders::_1, threshold_); } else if (slices[0] == "probability_entry") { - float threshold = std::stof(slices[1]); + threshold_ = std::stof(slices[1]); entry_func_ = - std::bind(&probility_entry, std::placeholders::_1, threshold); + std::bind(&probility_entry, std::placeholders::_1, threshold_); } else { PADDLE_THROW(platform::errors::InvalidArgument( "Not supported Entry Type : %s, Only support [CountFilterEntry, " @@ -143,7 +151,7 @@ class ValueBlock { const std::vector &value_dims) { auto pts = 
std::vector(); pts.reserve(value_names.size()); - auto &values = values_.at(id); + auto values = GetValue(id); for (int i = 0; i < static_cast(value_names.size()); i++) { PADDLE_ENFORCE_EQ( value_dims[i], value_dims_[i], @@ -155,30 +163,59 @@ class ValueBlock { } // pull - float *Init(const uint64_t &id, const bool with_update = true) { - if (!Has(id)) { - values_[id] = std::make_shared(value_length_); - } + float *Init(const uint64_t &id, const bool with_update = true, + const int counter = 1) { + size_t hash = _hasher(id); + size_t bucket = compute_bucket(hash); - auto &value = values_.at(id); + auto &table = values_[bucket]; + auto res = table.find(id); - if (with_update) { - AttrUpdate(value); + VALUE *value = nullptr; + if (res == table.end()) { + value = butil::get_object(value_length_); + + table[id] = value; + + } else { + value = res->second; } + if (with_update) { + AttrUpdate(value, counter); + } return value->data_.data(); } - void AttrUpdate(std::shared_ptr value) { + VALUE *InitGet(const uint64_t &id, const bool with_update = true, + const int counter = 1) { + size_t hash = _hasher(id); + size_t bucket = compute_bucket(hash); + + auto &table = values_[bucket]; + auto res = table.find(id); + + VALUE *value = nullptr; + if (res == table.end()) { + value = butil::get_object(value_length_); + // value = _alloc.acquire(value_length_); + table[id] = value; + } else { + value = (VALUE *)(void *)(res->second); + } + return value; + } + + void AttrUpdate(VALUE *value, const int counter) { // update state value->unseen_days_ = 0; - ++value->count_; + value->count_ += counter; if (!value->is_entry_) { value->is_entry_ = entry_func_(value); if (value->is_entry_) { // initialize - for (int x = 0; x < value_names_.size(); ++x) { + for (size_t x = 0; x < value_names_.size(); ++x) { initializers_[x]->GetValue(value->data_.data() + value_offsets_[x], value_dims_[x]); } @@ -193,40 +230,73 @@ class ValueBlock { // dont jude if (has(id)) float *Get(const uint64_t &id) { - auto &value = values_.at(id); + size_t hash = _hasher(id); + size_t bucket = compute_bucket(hash); + auto &table = values_[bucket]; + + // auto &value = table.at(id); + // return value->data_.data(); + auto res = table.find(id); + VALUE *value = res->second; return value->data_.data(); } // for load, to reset count, unseen_days - std::shared_ptr GetValue(const uint64_t &id) { return values_.at(id); } + VALUE *GetValue(const uint64_t &id) { + size_t hash = _hasher(id); + size_t bucket = compute_bucket(hash); + + auto &table = values_[bucket]; + auto res = table.find(id); + return res->second; + } bool GetEntry(const uint64_t &id) { - auto &value = values_.at(id); + auto value = GetValue(id); return value->is_entry_; } void SetEntry(const uint64_t &id, const bool state) { - auto &value = values_.at(id); + auto value = GetValue(id); value->is_entry_ = state; } void Shrink(const int threshold) { - for (auto iter = values_.begin(); iter != values_.end();) { - auto &value = iter->second; - value->unseen_days_++; - if (value->unseen_days_ >= threshold) { - iter = values_.erase(iter); - } else { - ++iter; + for (auto &table : values_) { + for (auto iter = table.begin(); iter != table.end();) { + // VALUE* value = (VALUE*)(void*)(iter->second); + VALUE *value = iter->second; + value->unseen_days_++; + if (value->unseen_days_ >= threshold) { + butil::return_object(iter->second); + //_alloc.release(iter->second); + //_alloc.release(value); + iter = table.erase(iter); + } else { + ++iter; + } } } return; } + float GetThreshold() { return 
threshold_; } + size_t compute_bucket(size_t hash) { + if (SPARSE_SHARD_BUCKET_NUM == 1) { + return 0; + } else { + return hash >> (sizeof(size_t) * 8 - SPARSE_SHARD_BUCKET_NUM_BITS); + } + } + private: bool Has(const uint64_t id) { - auto got = values_.find(id); - if (got == values_.end()) { + size_t hash = _hasher(id); + size_t bucket = compute_bucket(hash); + auto &table = values_[bucket]; + + auto got = table.find(id); + if (got == table.end()) { return false; } else { return true; @@ -234,8 +304,9 @@ class ValueBlock { } public: - std::unordered_map> values_; + robin_hood::unordered_map values_[SPARSE_SHARD_BUCKET_NUM]; size_t value_length_ = 0; + std::hash _hasher; private: const std::vector &value_names_; @@ -243,8 +314,9 @@ class ValueBlock { const std::vector &value_offsets_; const std::unordered_map &value_idx_; - std::function)> entry_func_; + std::function entry_func_; std::vector> initializers_; + float threshold_; }; } // namespace distributed diff --git a/paddle/fluid/distributed/table/depends/sparse.h b/paddle/fluid/distributed/table/depends/sparse.h index 672d6e7d396874b5cc5a296f15e3842a3233410b..0e1d7ef03c129c2dc6f72d6e56fafb143d879bd4 100644 --- a/paddle/fluid/distributed/table/depends/sparse.h +++ b/paddle/fluid/distributed/table/depends/sparse.h @@ -110,7 +110,6 @@ class SSGD : public SparseOptimizer { auto* value = block->Get(id); float learning_rate = *(global_learning_rate_) * (value + lr_offset)[0]; - VLOG(4) << "SSGD LearningRate: " << learning_rate; float* param = value + param_offset; std::vector grads; @@ -166,7 +165,6 @@ class SAdam : public SparseOptimizer { if (!block->GetEntry(id)) continue; auto* values = block->Get(id); float lr_ = *(global_learning_rate_) * (values + lr_offset)[0]; - VLOG(4) << "SAdam LearningRate: " << lr_; float* param = values + param_offset; float* moment1 = values + m1_offset; float* moment2 = values + m2_offset; diff --git a/paddle/fluid/distributed/table/depends/sparse_utils.h b/paddle/fluid/distributed/table/depends/sparse_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..c185dd17d792e4715ae884e66c412aa5f24f809f --- /dev/null +++ b/paddle/fluid/distributed/table/depends/sparse_utils.h @@ -0,0 +1,74 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
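The reworked ValueBlock above replaces the single map of shared_ptr values with SPARSE_SHARD_BUCKET_NUM (1 << 6 = 64) robin_hood maps of raw VALUE pointers and picks a bucket from the top bits of the key's std::hash, so Init, Get and Shrink only ever touch one small map. A standalone sketch of that bucket choice (illustrative, not the patch's code):

// Sketch of ValueBlock's bucket selection: top SPARSE_SHARD_BUCKET_NUM_BITS
// bits of std::hash of the key select one of 64 buckets.
#include <cstdint>
#include <cstdio>
#include <functional>

constexpr int kBucketBits = 6;                           // SPARSE_SHARD_BUCKET_NUM_BITS
constexpr size_t kBucketNum = size_t(1) << kBucketBits;  // SPARSE_SHARD_BUCKET_NUM

size_t compute_bucket(uint64_t key) {
  size_t hash = std::hash<uint64_t>()(key);
  if (kBucketNum == 1) return 0;
  return hash >> (sizeof(size_t) * 8 - kBucketBits);  // top bits -> [0, 64)
}

int main() {
  const uint64_t keys[] = {123456789ULL, 1ULL << 60, 1ULL << 62};
  for (uint64_t k : keys)
    std::printf("key=%llu bucket=%zu\n", (unsigned long long)k,
                compute_bucket(k));
  return 0;
}

Note that the exact bucket values depend on the standard library's std::hash; the point is only that every key deterministically lands in one of the 64 per-block maps.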
+ +#pragma once + +#include +#include +#include +#include +#include + +namespace paddle { +namespace distributed { + +struct PullSparseValue { + explicit PullSparseValue(int numel, int dim) + : numel_(numel), + dim_(dim), + is_training_(true), + feasigns_(nullptr), + frequencies_(nullptr) {} + + explicit PullSparseValue(std::vector feasigns, + std::vector frequencies, int dim) { + numel_ = feasigns.size(); + dim_ = dim; + is_training_ = true; + feasigns_ = feasigns.data(); + frequencies_ = frequencies.data(); + } + + void DeserializeFromBytes(void* bytes) { + /* + |---isTraining--------------| + |---8*{num}B(keysData)------| + |---4*{num}B(Frequencies)---| + */ + auto* begin = reinterpret_cast(bytes); + is_training_ = reinterpret_cast(begin)[0]; + feasigns_ = reinterpret_cast(begin + sizeof(bool)); + frequencies_ = reinterpret_cast(begin + sizeof(bool) + + sizeof(uint64_t) * numel_); + } + + void Fission(const int shard_id, const int shard_num, + std::vector* offset_shard) const { + offset_shard->reserve(numel_ / shard_num + 1); + for (int x = 0; x < numel_; ++x) { + if (feasigns_[x] % shard_num == shard_id) { + offset_shard->push_back(x); + } + } + } + + int numel_; + int dim_; + bool is_training_; + uint64_t* feasigns_; + uint32_t* frequencies_; +}; + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/table/graph/graph_edge.cc b/paddle/fluid/distributed/table/graph/graph_edge.cc new file mode 100644 index 0000000000000000000000000000000000000000..0ab0d5a76d6715401dd55ce7487634b72d452ddf --- /dev/null +++ b/paddle/fluid/distributed/table/graph/graph_edge.cc @@ -0,0 +1,29 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/distributed/table/graph/graph_edge.h" +#include +namespace paddle { +namespace distributed { + +void GraphEdgeBlob::add_edge(uint64_t id, float weight = 1) { + id_arr.push_back(id); +} + +void WeightedGraphEdgeBlob::add_edge(uint64_t id, float weight = 1) { + id_arr.push_back(id); + weight_arr.push_back(weight); +} +} +} diff --git a/paddle/fluid/distributed/table/graph/graph_edge.h b/paddle/fluid/distributed/table/graph/graph_edge.h new file mode 100644 index 0000000000000000000000000000000000000000..3dfe5a6f357a7cd7d79834a20b6411995665f4fa --- /dev/null +++ b/paddle/fluid/distributed/table/graph/graph_edge.h @@ -0,0 +1,46 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +namespace paddle { +namespace distributed { + +class GraphEdgeBlob { + public: + GraphEdgeBlob() {} + virtual ~GraphEdgeBlob() {} + size_t size() { return id_arr.size(); } + virtual void add_edge(uint64_t id, float weight); + uint64_t get_id(int idx) { return id_arr[idx]; } + virtual float get_weight(int idx) { return 1; } + + protected: + std::vector id_arr; +}; + +class WeightedGraphEdgeBlob : public GraphEdgeBlob { + public: + WeightedGraphEdgeBlob() {} + virtual ~WeightedGraphEdgeBlob() {} + virtual void add_edge(uint64_t id, float weight); + virtual float get_weight(int idx) { return weight_arr[idx]; } + + protected: + std::vector weight_arr; +}; +} +} diff --git a/paddle/fluid/distributed/table/graph/graph_node.cc b/paddle/fluid/distributed/table/graph/graph_node.cc new file mode 100644 index 0000000000000000000000000000000000000000..816d31b979072c3f1679df1ea75cd9dc75c55b0a --- /dev/null +++ b/paddle/fluid/distributed/table/graph/graph_node.cc @@ -0,0 +1,117 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/distributed/table/graph/graph_node.h" +#include +namespace paddle { +namespace distributed { + +GraphNode::~GraphNode() { + if (sampler != nullptr) { + delete sampler; + sampler = nullptr; + } + if (edges != nullptr) { + delete edges; + edges = nullptr; + } +} + +int Node::weight_size = sizeof(float); +int Node::id_size = sizeof(uint64_t); +int Node::int_size = sizeof(int); + +int Node::get_size(bool need_feature) { return id_size + int_size; } + +void Node::to_buffer(char* buffer, bool need_feature) { + memcpy(buffer, &id, id_size); + buffer += id_size; + + int feat_num = 0; + memcpy(buffer, &feat_num, sizeof(int)); +} + +void Node::recover_from_buffer(char* buffer) { memcpy(&id, buffer, id_size); } + +int FeatureNode::get_size(bool need_feature) { + int size = id_size + int_size; // id, feat_num + if (need_feature) { + size += feature.size() * int_size; + for (const std::string& fea : feature) { + size += fea.size(); + } + } + return size; +} + +void GraphNode::build_edges(bool is_weighted) { + if (edges == nullptr) { + if (is_weighted == true) { + edges = new WeightedGraphEdgeBlob(); + } else { + edges = new GraphEdgeBlob(); + } + } +} +void GraphNode::build_sampler(std::string sample_type) { + if (sample_type == "random") { + sampler = new RandomSampler(); + } else if (sample_type == "weighted") { + sampler = new WeightedSampler(); + } + sampler->build(edges); +} +void FeatureNode::to_buffer(char* buffer, bool need_feature) { + memcpy(buffer, &id, id_size); + buffer += id_size; + + int feat_num = 0; + int feat_len; + if (need_feature) { + feat_num += feature.size(); + memcpy(buffer, &feat_num, sizeof(int)); + buffer += sizeof(int); + for (int i = 0; i < feat_num; ++i) { + feat_len = feature[i].size(); + memcpy(buffer, 
&feat_len, sizeof(int)); + buffer += sizeof(int); + memcpy(buffer, feature[i].c_str(), feature[i].size()); + buffer += feature[i].size(); + } + } else { + memcpy(buffer, &feat_num, sizeof(int)); + } +} +void FeatureNode::recover_from_buffer(char* buffer) { + int feat_num, feat_len; + memcpy(&id, buffer, id_size); + buffer += id_size; + + memcpy(&feat_num, buffer, sizeof(int)); + buffer += sizeof(int); + + feature.clear(); + for (int i = 0; i < feat_num; ++i) { + memcpy(&feat_len, buffer, sizeof(int)); + buffer += sizeof(int); + + char str[feat_len + 1]; + memcpy(str, buffer, feat_len); + buffer += feat_len; + str[feat_len] = '\0'; + feature.push_back(std::string(str)); + } +} +} +} diff --git a/paddle/fluid/distributed/table/graph/graph_node.h b/paddle/fluid/distributed/table/graph/graph_node.h new file mode 100644 index 0000000000000000000000000000000000000000..8ad795ac97b5499c7b10361760f7ac16494c154b --- /dev/null +++ b/paddle/fluid/distributed/table/graph/graph_node.h @@ -0,0 +1,127 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include +#include "paddle/fluid/distributed/table/graph/graph_weighted_sampler.h" +namespace paddle { +namespace distributed { + +class Node { + public: + Node() {} + Node(uint64_t id) : id(id) {} + virtual ~Node() {} + static int id_size, int_size, weight_size; + uint64_t get_id() { return id; } + void set_id(uint64_t id) { this->id = id; } + + virtual void build_edges(bool is_weighted) {} + virtual void build_sampler(std::string sample_type) {} + virtual void add_edge(uint64_t id, float weight) {} + virtual std::vector sample_k(int k) { return std::vector(); } + virtual uint64_t get_neighbor_id(int idx) { return 0; } + virtual float get_neighbor_weight(int idx) { return 1.; } + + virtual int get_size(bool need_feature); + virtual void to_buffer(char *buffer, bool need_feature); + virtual void recover_from_buffer(char *buffer); + virtual std::string get_feature(int idx) { return std::string(""); } + virtual void set_feature(int idx, std::string str) {} + virtual void set_feature_size(int size) {} + virtual int get_feature_size() { return 0; } + + protected: + uint64_t id; +}; + +class GraphNode : public Node { + public: + GraphNode() : Node(), sampler(nullptr), edges(nullptr) {} + GraphNode(uint64_t id) : Node(id), sampler(nullptr), edges(nullptr) {} + virtual ~GraphNode(); + virtual void build_edges(bool is_weighted); + virtual void build_sampler(std::string sample_type); + virtual void add_edge(uint64_t id, float weight) { + edges->add_edge(id, weight); + } + virtual std::vector sample_k(int k) { return sampler->sample_k(k); } + virtual uint64_t get_neighbor_id(int idx) { return edges->get_id(idx); } + virtual float get_neighbor_weight(int idx) { return edges->get_weight(idx); } + + protected: + Sampler *sampler; + GraphEdgeBlob *edges; +}; + +class FeatureNode : public Node { + public: + FeatureNode() : Node() {} + 
FeatureNode(uint64_t id) : Node(id) {} + virtual ~FeatureNode() {} + virtual int get_size(bool need_feature); + virtual void to_buffer(char *buffer, bool need_feature); + virtual void recover_from_buffer(char *buffer); + virtual std::string get_feature(int idx) { + if (idx < (int)this->feature.size()) { + return this->feature[idx]; + } else { + return std::string(""); + } + } + + virtual void set_feature(int idx, std::string str) { + if (idx >= (int)this->feature.size()) { + this->feature.resize(idx + 1); + } + this->feature[idx] = str; + } + virtual void set_feature_size(int size) { this->feature.resize(size); } + virtual int get_feature_size() { return this->feature.size(); } + + template + static std::string parse_value_to_bytes(std::vector feat_str) { + T v; + size_t Tsize = sizeof(T) * feat_str.size(); + char buffer[Tsize]; + for (size_t i = 0; i < feat_str.size(); i++) { + std::stringstream ss(feat_str[i]); + ss >> v; + std::memcpy(buffer + sizeof(T) * i, (char *)&v, sizeof(T)); + } + return std::string(buffer, Tsize); + } + + template + static std::vector parse_bytes_to_array(std::string feat_str) { + T v; + std::vector out; + size_t start = 0; + const char *buffer = feat_str.data(); + while (start < feat_str.size()) { + std::memcpy((char *)&v, buffer + start, sizeof(T)); + start += sizeof(T); + out.push_back(v); + } + return out; + } + + protected: + std::vector feature; +}; +} +} diff --git a/paddle/fluid/distributed/table/graph/graph_weighted_sampler.cc b/paddle/fluid/distributed/table/graph/graph_weighted_sampler.cc new file mode 100644 index 0000000000000000000000000000000000000000..3a680875e3df4a9cd60f8fe1921b877dbb23c8a2 --- /dev/null +++ b/paddle/fluid/distributed/table/graph/graph_weighted_sampler.cc @@ -0,0 +1,150 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
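parse_value_to_bytes above converts the textual feature values into the raw bytes of the target type, one stringstream-parsed value per slot, and parse_bytes_to_array reverses it. A simplified round trip for float values (illustrative only; it skips the text-parsing step and is not the patch's template):

// Illustrative round trip of FeatureNode's byte packing for numeric features:
// each value is stored as its raw in-memory bytes, concatenated in order.
#include <cstdio>
#include <cstring>
#include <string>
#include <vector>

static std::string pack_floats(const std::vector<float>& vals) {
  std::string out(vals.size() * sizeof(float), '\0');
  if (!vals.empty()) std::memcpy(&out[0], vals.data(), out.size());
  return out;
}

static std::vector<float> unpack_floats(const std::string& bytes) {
  std::vector<float> out(bytes.size() / sizeof(float));
  if (!out.empty()) std::memcpy(out.data(), bytes.data(), out.size() * sizeof(float));
  return out;
}

int main() {
  std::string bytes = pack_floats({0.5f, 1.25f, -3.0f});
  for (float v : unpack_floats(bytes)) std::printf("%g ", v);
  std::printf("\n");  // prints: 0.5 1.25 -3
  return 0;
}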
+ +#include "paddle/fluid/distributed/table/graph/graph_weighted_sampler.h" +#include +#include +namespace paddle { +namespace distributed { + +void RandomSampler::build(GraphEdgeBlob *edges) { this->edges = edges; } + +std::vector RandomSampler::sample_k(int k) { + int n = edges->size(); + if (k > n) { + k = n; + } + struct timespec tn; + clock_gettime(CLOCK_REALTIME, &tn); + srand(tn.tv_nsec); + std::vector sample_result; + std::unordered_map replace_map; + while (k--) { + int rand_int = rand() % n; + auto iter = replace_map.find(rand_int); + if (iter == replace_map.end()) { + sample_result.push_back(rand_int); + } else { + sample_result.push_back(iter->second); + } + + iter = replace_map.find(n - 1); + if (iter == replace_map.end()) { + replace_map[rand_int] = n - 1; + } else { + replace_map[rand_int] = iter->second; + } + --n; + } + return sample_result; +} + +WeightedSampler::WeightedSampler() { + left = nullptr; + right = nullptr; + edges = nullptr; +} + +WeightedSampler::~WeightedSampler() { + if (left != nullptr) { + delete left; + left = nullptr; + } + if (right != nullptr) { + delete right; + right = nullptr; + } +} + +void WeightedSampler::build(GraphEdgeBlob *edges) { + if (left != nullptr) { + delete left; + left = nullptr; + } + if (right != nullptr) { + delete right; + right = nullptr; + } + return build_one((WeightedGraphEdgeBlob *)edges, 0, edges->size()); +} + +void WeightedSampler::build_one(WeightedGraphEdgeBlob *edges, int start, + int end) { + count = 0; + this->edges = edges; + if (start + 1 == end) { + left = right = nullptr; + idx = start; + count = 1; + weight = edges->get_weight(idx); + + } else { + left = new WeightedSampler(); + right = new WeightedSampler(); + left->build_one(edges, start, start + (end - start) / 2); + right->build_one(edges, start + (end - start) / 2, end); + weight = left->weight + right->weight; + count = left->count + right->count; + } +} +std::vector WeightedSampler::sample_k(int k) { + if (k > count) { + k = count; + } + std::vector sample_result; + float subtract; + std::unordered_map subtract_weight_map; + std::unordered_map subtract_count_map; + struct timespec tn; + clock_gettime(CLOCK_REALTIME, &tn); + srand(tn.tv_nsec); + while (k--) { + float query_weight = rand() % 100000 / 100000.0; + query_weight *= weight - subtract_weight_map[this]; + sample_result.push_back(sample(query_weight, subtract_weight_map, + subtract_count_map, subtract)); + } + return sample_result; +} + +int WeightedSampler::sample( + float query_weight, + std::unordered_map &subtract_weight_map, + std::unordered_map &subtract_count_map, + float &subtract) { + if (left == nullptr) { + subtract_weight_map[this] = weight; + subtract = weight; + subtract_count_map[this] = 1; + return idx; + } + int left_count = left->count - subtract_count_map[left]; + int right_count = right->count - subtract_count_map[right]; + float left_subtract = subtract_weight_map[left]; + int return_idx; + if (right_count == 0 || + left_count > 0 && left->weight - left_subtract >= query_weight) { + return_idx = left->sample(query_weight, subtract_weight_map, + subtract_count_map, subtract); + } else { + return_idx = + right->sample(query_weight - (left->weight - left_subtract), + subtract_weight_map, subtract_count_map, subtract); + } + subtract_weight_map[this] += subtract; + subtract_count_map[this]++; + return return_idx; +} +} +} diff --git a/paddle/fluid/distributed/table/graph/graph_weighted_sampler.h b/paddle/fluid/distributed/table/graph/graph_weighted_sampler.h new file mode 100644 
index 0000000000000000000000000000000000000000..1787ab23b04316de9ad0622ff5524bc88bd51fe1 --- /dev/null +++ b/paddle/fluid/distributed/table/graph/graph_weighted_sampler.h @@ -0,0 +1,58 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include "paddle/fluid/distributed/table/graph/graph_edge.h" +namespace paddle { +namespace distributed { + +class Sampler { + public: + virtual ~Sampler() {} + virtual void build(GraphEdgeBlob *edges) = 0; + virtual std::vector sample_k(int k) = 0; +}; + +class RandomSampler : public Sampler { + public: + virtual ~RandomSampler() {} + virtual void build(GraphEdgeBlob *edges); + virtual std::vector sample_k(int k); + GraphEdgeBlob *edges; +}; + +class WeightedSampler : public Sampler { + public: + WeightedSampler(); + virtual ~WeightedSampler(); + WeightedSampler *left, *right; + float weight; + int count; + int idx; + GraphEdgeBlob *edges; + virtual void build(GraphEdgeBlob *edges); + virtual void build_one(WeightedGraphEdgeBlob *edges, int start, int end); + virtual std::vector sample_k(int k); + + private: + int sample(float query_weight, + std::unordered_map &subtract_weight_map, + std::unordered_map &subtract_count_map, + float &subtract); +}; +} +} diff --git a/paddle/fluid/distributed/table/graph_edge.cc b/paddle/fluid/distributed/table/graph_edge.cc new file mode 100644 index 0000000000000000000000000000000000000000..cc90f4c6516c1873b078b96c550d0d52ac5d3b9c --- /dev/null +++ b/paddle/fluid/distributed/table/graph_edge.cc @@ -0,0 +1,29 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/distributed/table/graph_edge.h" +#include +namespace paddle { +namespace distributed { + +void GraphEdgeBlob::add_edge(uint64_t id, float weight = 1) { + id_arr.push_back(id); +} + +void WeightedGraphEdgeBlob::add_edge(uint64_t id, float weight = 1) { + id_arr.push_back(id); + weight_arr.push_back(weight); +} +} +} diff --git a/paddle/fluid/distributed/table/graph_edge.h b/paddle/fluid/distributed/table/graph_edge.h new file mode 100644 index 0000000000000000000000000000000000000000..3dfe5a6f357a7cd7d79834a20b6411995665f4fa --- /dev/null +++ b/paddle/fluid/distributed/table/graph_edge.h @@ -0,0 +1,46 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
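RandomSampler::sample_k above draws k distinct edge indices without materialising the whole [0, n) range: each draw remembers, in a hash map, which value the just-consumed slot should hold next, which amounts to a sparse Fisher-Yates shuffle. A standalone restatement of that trick (illustrative; the patch seeds rand() from clock_gettime instead of a fixed seed):

// Draw k distinct indices from [0, n) using a hash map of displaced slots.
#include <cstdio>
#include <cstdlib>
#include <unordered_map>
#include <vector>

std::vector<int> sample_k_distinct(int n, int k) {
  std::vector<int> out;
  if (n <= 0) return out;
  if (k > n) k = n;
  std::unordered_map<int, int> replaced;  // slot -> value it currently holds
  while (k-- > 0) {
    int r = std::rand() % n;
    auto hit = replaced.find(r);
    out.push_back(hit == replaced.end() ? r : hit->second);
    // Move whatever is in the last slot into the slot we just consumed,
    // then shrink the virtual range by one.
    auto last = replaced.find(n - 1);
    replaced[r] = (last == replaced.end()) ? n - 1 : last->second;
    --n;
  }
  return out;
}

int main() {
  std::srand(42);  // deterministic for the demo
  for (int idx : sample_k_distinct(10, 4)) std::printf("%d ", idx);
  std::printf("\n");
  return 0;
}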
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +namespace paddle { +namespace distributed { + +class GraphEdgeBlob { + public: + GraphEdgeBlob() {} + virtual ~GraphEdgeBlob() {} + size_t size() { return id_arr.size(); } + virtual void add_edge(uint64_t id, float weight); + uint64_t get_id(int idx) { return id_arr[idx]; } + virtual float get_weight(int idx) { return 1; } + + protected: + std::vector id_arr; +}; + +class WeightedGraphEdgeBlob : public GraphEdgeBlob { + public: + WeightedGraphEdgeBlob() {} + virtual ~WeightedGraphEdgeBlob() {} + virtual void add_edge(uint64_t id, float weight); + virtual float get_weight(int idx) { return weight_arr[idx]; } + + protected: + std::vector weight_arr; +}; +} +} diff --git a/paddle/fluid/distributed/table/graph_node.cc b/paddle/fluid/distributed/table/graph_node.cc new file mode 100644 index 0000000000000000000000000000000000000000..27a2cafaf4f0fec95de818204ebd191a5083e50a --- /dev/null +++ b/paddle/fluid/distributed/table/graph_node.cc @@ -0,0 +1,117 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
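Note: in the edge blobs above, GraphEdgeBlob stores only a flat array of neighbor ids (its add_edge drops the weight argument and get_weight always returns 1), while WeightedGraphEdgeBlob keeps a parallel weight array. A short usage sketch, assuming the graph_edge.h path added by this patch is on the include path:

// Sketch of the two edge-blob variants defined above.
#include <iostream>
#include "paddle/fluid/distributed/table/graph_edge.h"

int main() {
  paddle::distributed::WeightedGraphEdgeBlob weighted;
  weighted.add_edge(45, 0.34f);
  weighted.add_edge(145, 0.31f);
  std::cout << weighted.size() << " neighbors, first id " << weighted.get_id(0)
            << " with weight " << weighted.get_weight(0) << std::endl;

  paddle::distributed::GraphEdgeBlob unweighted;
  unweighted.add_edge(45, 0.34f);  // the weight is ignored by this variant
  std::cout << "unweighted get_weight(0) = " << unweighted.get_weight(0)
            << std::endl;  // always 1
  return 0;
}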
+ +#include "paddle/fluid/distributed/table/graph_node.h" +#include +namespace paddle { +namespace distributed { + +GraphNode::~GraphNode() { + if (sampler != nullptr) { + delete sampler; + sampler = nullptr; + } + if (edges != nullptr) { + delete edges; + edges = nullptr; + } +} + +int Node::weight_size = sizeof(float); +int Node::id_size = sizeof(uint64_t); +int Node::int_size = sizeof(int); + +int Node::get_size(bool need_feature) { return id_size + int_size; } + +void Node::to_buffer(char* buffer, bool need_feature) { + memcpy(buffer, &id, id_size); + buffer += id_size; + + int feat_num = 0; + memcpy(buffer, &feat_num, sizeof(int)); +} + +void Node::recover_from_buffer(char* buffer) { memcpy(&id, buffer, id_size); } + +int FeatureNode::get_size(bool need_feature) { + int size = id_size + int_size; // id, feat_num + if (need_feature) { + size += feature.size() * int_size; + for (const std::string& fea : feature) { + size += fea.size(); + } + } + return size; +} + +void GraphNode::build_edges(bool is_weighted) { + if (edges == nullptr) { + if (is_weighted == true) { + edges = new WeightedGraphEdgeBlob(); + } else { + edges = new GraphEdgeBlob(); + } + } +} +void GraphNode::build_sampler(std::string sample_type) { + if (sample_type == "random") { + sampler = new RandomSampler(); + } else if (sample_type == "weighted") { + sampler = new WeightedSampler(); + } + sampler->build(edges); +} +void FeatureNode::to_buffer(char* buffer, bool need_feature) { + memcpy(buffer, &id, id_size); + buffer += id_size; + + int feat_num = 0; + int feat_len; + if (need_feature) { + feat_num += feature.size(); + memcpy(buffer, &feat_num, sizeof(int)); + buffer += sizeof(int); + for (int i = 0; i < feat_num; ++i) { + feat_len = feature[i].size(); + memcpy(buffer, &feat_len, sizeof(int)); + buffer += sizeof(int); + memcpy(buffer, feature[i].c_str(), feature[i].size()); + buffer += feature[i].size(); + } + } else { + memcpy(buffer, &feat_num, sizeof(int)); + } +} +void FeatureNode::recover_from_buffer(char* buffer) { + int feat_num, feat_len; + memcpy(&id, buffer, id_size); + buffer += id_size; + + memcpy(&feat_num, buffer, sizeof(int)); + buffer += sizeof(int); + + feature.clear(); + for (int i = 0; i < feat_num; ++i) { + memcpy(&feat_len, buffer, sizeof(int)); + buffer += sizeof(int); + + char str[feat_len + 1]; + memcpy(str, buffer, feat_len); + buffer += feat_len; + str[feat_len] = '\0'; + feature.push_back(std::string(str)); + } +} +} +} diff --git a/paddle/fluid/distributed/table/graph_node.h b/paddle/fluid/distributed/table/graph_node.h new file mode 100644 index 0000000000000000000000000000000000000000..c3e8e3ce5b50d06945857ded1db168f84f955c5f --- /dev/null +++ b/paddle/fluid/distributed/table/graph_node.h @@ -0,0 +1,127 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include +#include +#include +#include "paddle/fluid/distributed/table/graph_weighted_sampler.h" +namespace paddle { +namespace distributed { + +class Node { + public: + Node() {} + Node(uint64_t id) : id(id) {} + virtual ~Node() {} + static int id_size, int_size, weight_size; + uint64_t get_id() { return id; } + void set_id(uint64_t id) { this->id = id; } + + virtual void build_edges(bool is_weighted) {} + virtual void build_sampler(std::string sample_type) {} + virtual void add_edge(uint64_t id, float weight) {} + virtual std::vector sample_k(int k) { return std::vector(); } + virtual uint64_t get_neighbor_id(int idx) { return 0; } + virtual float get_neighbor_weight(int idx) { return 1.; } + + virtual int get_size(bool need_feature); + virtual void to_buffer(char *buffer, bool need_feature); + virtual void recover_from_buffer(char *buffer); + virtual std::string get_feature(int idx) { return std::string(""); } + virtual void set_feature(int idx, std::string str) {} + virtual void set_feature_size(int size) {} + virtual int get_feature_size() { return 0; } + + protected: + uint64_t id; +}; + +class GraphNode : public Node { + public: + GraphNode() : Node(), sampler(nullptr), edges(nullptr) {} + GraphNode(uint64_t id) : Node(id), sampler(nullptr), edges(nullptr) {} + virtual ~GraphNode(); + virtual void build_edges(bool is_weighted); + virtual void build_sampler(std::string sample_type); + virtual void add_edge(uint64_t id, float weight) { + edges->add_edge(id, weight); + } + virtual std::vector sample_k(int k) { return sampler->sample_k(k); } + virtual uint64_t get_neighbor_id(int idx) { return edges->get_id(idx); } + virtual float get_neighbor_weight(int idx) { return edges->get_weight(idx); } + + protected: + Sampler *sampler; + GraphEdgeBlob *edges; +}; + +class FeatureNode : public Node { + public: + FeatureNode() : Node() {} + FeatureNode(uint64_t id) : Node(id) {} + virtual ~FeatureNode() {} + virtual int get_size(bool need_feature); + virtual void to_buffer(char *buffer, bool need_feature); + virtual void recover_from_buffer(char *buffer); + virtual std::string get_feature(int idx) { + if (idx < (int)this->feature.size()) { + return this->feature[idx]; + } else { + return std::string(""); + } + } + + virtual void set_feature(int idx, std::string str) { + if (idx >= (int)this->feature.size()) { + this->feature.resize(idx + 1); + } + this->feature[idx] = str; + } + virtual void set_feature_size(int size) { this->feature.resize(size); } + virtual int get_feature_size() { return this->feature.size(); } + + template + static std::string parse_value_to_bytes(std::vector feat_str) { + T v; + size_t Tsize = sizeof(T) * feat_str.size(); + char buffer[Tsize]; + for (size_t i = 0; i < feat_str.size(); i++) { + std::stringstream ss(feat_str[i]); + ss >> v; + std::memcpy(buffer + sizeof(T) * i, (char *)&v, sizeof(T)); + } + return std::string(buffer, Tsize); + } + + template + static std::vector parse_bytes_to_array(std::string feat_str) { + T v; + std::vector out; + size_t start = 0; + const char *buffer = feat_str.data(); + while (start < feat_str.size()) { + std::memcpy((char *)&v, buffer + start, sizeof(T)); + start += sizeof(T); + out.push_back(v); + } + return out; + } + + protected: + std::vector feature; +}; +} +} diff --git a/paddle/fluid/distributed/table/graph_weighted_sampler.cc b/paddle/fluid/distributed/table/graph_weighted_sampler.cc new file mode 100644 index 0000000000000000000000000000000000000000..059a1d64bc392d7ef6936c008bbeec3bef3a5fb9 --- 
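Note: the parse_value_to_bytes / parse_bytes_to_array templates above convert between string-encoded feature values and a packed fixed-width binary string, with the element type supplied as a template argument; the int32_t used below is an assumption matching the "int32" feature round-trip exercised by testFeatureNodeSerializeInt later in this patch:

// Packs two decimal strings as int32_t values and unpacks them again,
// mirroring testFeatureNodeSerializeInt in graph_node_test.cc below.
#include <cassert>
#include <cstdint>
#include <string>
#include <vector>
#include "paddle/fluid/distributed/table/graph_node.h"

int main() {
  using paddle::distributed::FeatureNode;
  std::string packed =
      FeatureNode::parse_value_to_bytes<int32_t>({"123", "345"});
  std::vector<int32_t> values =
      FeatureNode::parse_bytes_to_array<int32_t>(packed);
  assert(values.size() == 2);
  assert(values[0] == 123 && values[1] == 345);
  return 0;
}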
/dev/null +++ b/paddle/fluid/distributed/table/graph_weighted_sampler.cc @@ -0,0 +1,150 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/distributed/table/graph_weighted_sampler.h" +#include +#include +namespace paddle { +namespace distributed { + +void RandomSampler::build(GraphEdgeBlob *edges) { this->edges = edges; } + +std::vector RandomSampler::sample_k(int k) { + int n = edges->size(); + if (k > n) { + k = n; + } + struct timespec tn; + clock_gettime(CLOCK_REALTIME, &tn); + srand(tn.tv_nsec); + std::vector sample_result; + std::unordered_map replace_map; + while (k--) { + int rand_int = rand() % n; + auto iter = replace_map.find(rand_int); + if (iter == replace_map.end()) { + sample_result.push_back(rand_int); + } else { + sample_result.push_back(iter->second); + } + + iter = replace_map.find(n - 1); + if (iter == replace_map.end()) { + replace_map[rand_int] = n - 1; + } else { + replace_map[rand_int] = iter->second; + } + --n; + } + return sample_result; +} + +WeightedSampler::WeightedSampler() { + left = nullptr; + right = nullptr; + edges = nullptr; +} + +WeightedSampler::~WeightedSampler() { + if (left != nullptr) { + delete left; + left = nullptr; + } + if (right != nullptr) { + delete right; + right = nullptr; + } +} + +void WeightedSampler::build(GraphEdgeBlob *edges) { + if (left != nullptr) { + delete left; + left = nullptr; + } + if (right != nullptr) { + delete right; + right = nullptr; + } + return build_one((WeightedGraphEdgeBlob *)edges, 0, edges->size()); +} + +void WeightedSampler::build_one(WeightedGraphEdgeBlob *edges, int start, + int end) { + count = 0; + this->edges = edges; + if (start + 1 == end) { + left = right = nullptr; + idx = start; + count = 1; + weight = edges->get_weight(idx); + + } else { + left = new WeightedSampler(); + right = new WeightedSampler(); + left->build_one(edges, start, start + (end - start) / 2); + right->build_one(edges, start + (end - start) / 2, end); + weight = left->weight + right->weight; + count = left->count + right->count; + } +} +std::vector WeightedSampler::sample_k(int k) { + if (k > count) { + k = count; + } + std::vector sample_result; + float subtract; + std::unordered_map subtract_weight_map; + std::unordered_map subtract_count_map; + struct timespec tn; + clock_gettime(CLOCK_REALTIME, &tn); + srand(tn.tv_nsec); + while (k--) { + float query_weight = rand() % 100000 / 100000.0; + query_weight *= weight - subtract_weight_map[this]; + sample_result.push_back(sample(query_weight, subtract_weight_map, + subtract_count_map, subtract)); + } + return sample_result; +} + +int WeightedSampler::sample( + float query_weight, + std::unordered_map &subtract_weight_map, + std::unordered_map &subtract_count_map, + float &subtract) { + if (left == nullptr) { + subtract_weight_map[this] = weight; + subtract = weight; + subtract_count_map[this] = 1; + return idx; + } + int left_count = left->count - subtract_count_map[left]; + int 
right_count = right->count - subtract_count_map[right]; + float left_subtract = subtract_weight_map[left]; + int return_idx; + if (right_count == 0 || + left_count > 0 && left->weight - left_subtract >= query_weight) { + return_idx = left->sample(query_weight, subtract_weight_map, + subtract_count_map, subtract); + } else { + return_idx = + right->sample(query_weight - (left->weight - left_subtract), + subtract_weight_map, subtract_count_map, subtract); + } + subtract_weight_map[this] += subtract; + subtract_count_map[this]++; + return return_idx; +} +} +} diff --git a/paddle/fluid/distributed/table/graph_weighted_sampler.h b/paddle/fluid/distributed/table/graph_weighted_sampler.h new file mode 100644 index 0000000000000000000000000000000000000000..cfc341d27c6b766fcee57e8973a4353d4fe93b4e --- /dev/null +++ b/paddle/fluid/distributed/table/graph_weighted_sampler.h @@ -0,0 +1,58 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include "paddle/fluid/distributed/table/graph_edge.h" +namespace paddle { +namespace distributed { + +class Sampler { + public: + virtual ~Sampler() {} + virtual void build(GraphEdgeBlob *edges) = 0; + virtual std::vector sample_k(int k) = 0; +}; + +class RandomSampler : public Sampler { + public: + virtual ~RandomSampler() {} + virtual void build(GraphEdgeBlob *edges); + virtual std::vector sample_k(int k); + GraphEdgeBlob *edges; +}; + +class WeightedSampler : public Sampler { + public: + WeightedSampler(); + virtual ~WeightedSampler(); + WeightedSampler *left, *right; + float weight; + int count; + int idx; + GraphEdgeBlob *edges; + virtual void build(GraphEdgeBlob *edges); + virtual void build_one(WeightedGraphEdgeBlob *edges, int start, int end); + virtual std::vector sample_k(int k); + + private: + int sample(float query_weight, + std::unordered_map &subtract_weight_map, + std::unordered_map &subtract_count_map, + float &subtract); +}; +} +} diff --git a/paddle/fluid/distributed/table/sparse_geo_table.cc b/paddle/fluid/distributed/table/sparse_geo_table.cc index 9b276e7de5c92d495f9d40535033b0a82186bc82..04cd1136382a4e24eb0a6d196ec01ad68ed56309 100644 --- a/paddle/fluid/distributed/table/sparse_geo_table.cc +++ b/paddle/fluid/distributed/table/sparse_geo_table.cc @@ -22,8 +22,17 @@ int32_t SparseGeoTable::pull_geo_param(const uint32_t trainer_id, std::vector* ids) { geo_recorder->GetAndClear(trainer_id, ids); auto dim = _config.common().dims()[0]; + + std::vector frequencies; + frequencies.resize(ids->size(), 1); + + auto pull_value = PullSparseValue(ids->size(), dim); + pull_value.is_training_ = true; + pull_value.feasigns_ = ids->data(); + pull_value.frequencies_ = frequencies.data(); + values->resize(ids->size() * dim); - CommonSparseTable::pull_sparse(values->data(), ids->data(), ids->size()); + CommonSparseTable::pull_sparse(values->data(), pull_value); return 0; } diff --git a/paddle/fluid/distributed/table/table.cc 
b/paddle/fluid/distributed/table/table.cc index dfaaa6ffc12c2b363de5f26df01ab4b8db9f0153..600be954cb59663fff6f867c020248a92e81a151 100644 --- a/paddle/fluid/distributed/table/table.cc +++ b/paddle/fluid/distributed/table/table.cc @@ -18,6 +18,7 @@ #include "paddle/fluid/distributed/common/registerer.h" #include "paddle/fluid/distributed/table/common_dense_table.h" +#include "paddle/fluid/distributed/table/common_graph_table.h" #include "paddle/fluid/distributed/table/common_sparse_table.h" #include "paddle/fluid/distributed/table/sparse_geo_table.h" #include "paddle/fluid/distributed/table/tensor_accessor.h" @@ -25,7 +26,7 @@ namespace paddle { namespace distributed { - +REGISTER_PSCORE_CLASS(Table, GraphTable); REGISTER_PSCORE_CLASS(Table, CommonDenseTable); REGISTER_PSCORE_CLASS(Table, CommonSparseTable); REGISTER_PSCORE_CLASS(Table, SparseGeoTable); @@ -75,5 +76,6 @@ int32_t Table::initialize_accessor() { _value_accesor.reset(accessor); return 0; } + } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/table/table.h b/paddle/fluid/distributed/table/table.h index 65c99d2bbd40d4567f49eb84bd84173a0a3fee0b..81a1ff5eced2bb36b8f917a31de1e214b272bfa3 100644 --- a/paddle/fluid/distributed/table/table.h +++ b/paddle/fluid/distributed/table/table.h @@ -21,6 +21,8 @@ #include #include #include "paddle/fluid/distributed/table/accessor.h" +#include "paddle/fluid/distributed/table/depends/sparse_utils.h" +#include "paddle/fluid/distributed/table/graph/graph_node.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/platform/device_context.h" @@ -46,10 +48,17 @@ class Table { return 0; } - virtual int32_t pull_sparse(float *values, const uint64_t *keys, - size_t num) = 0; + virtual int32_t pull_sparse_ptr(char **pull_values, const uint64_t *keys, + size_t num) { + VLOG(0) << "NOT IMPLEMENT"; + return 0; + } + virtual int32_t pull_sparse(float *values, + const PullSparseValue &pull_value) = 0; virtual int32_t push_sparse(const uint64_t *keys, const float *values, size_t num) = 0; + virtual int32_t push_sparse(const uint64_t *keys, const float **values, + size_t num){}; virtual int32_t push_sparse_param(const uint64_t *keys, const float *values, size_t num) { return 0; @@ -141,5 +150,6 @@ class TableManager { TableManager() {} ~TableManager() {} }; + } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/table/tensor_table.h b/paddle/fluid/distributed/table/tensor_table.h index 1a8f1a9cd9adb841c3ed1fcf849a3a293c47cc52..080682d131420b5b57ce470b6b570fe24a1925b3 100644 --- a/paddle/fluid/distributed/table/tensor_table.h +++ b/paddle/fluid/distributed/table/tensor_table.h @@ -52,8 +52,8 @@ class TensorTable : public Table { int32_t push_dense(const float *values, size_t num) override { return 0; } - int32_t pull_sparse(float *values, const uint64_t *keys, - size_t num) override { + int32_t pull_sparse(float *values, + const PullSparseValue &pull_value) override { return 0; } int32_t push_sparse(const uint64_t *keys, const float *values, @@ -102,8 +102,8 @@ class DenseTensorTable : public TensorTable { DenseTensorTable() {} virtual ~DenseTensorTable() {} - int32_t pull_sparse(float *values, const uint64_t *keys, - size_t num) override { + int32_t pull_sparse(float *values, + const PullSparseValue &pull_value) override { return 0; } int32_t push_sparse(const uint64_t *keys, const float *values, @@ -158,8 +158,8 @@ class GlobalStepTable : public DenseTensorTable { GlobalStepTable() 
{} virtual ~GlobalStepTable() {} - int32_t pull_sparse(float *values, const uint64_t *keys, - size_t num) override { + int32_t pull_sparse(float *values, + const PullSparseValue &pull_value) override { return 0; } int32_t push_sparse(const uint64_t *keys, const float *values, diff --git a/paddle/fluid/distributed/test/CMakeLists.txt b/paddle/fluid/distributed/test/CMakeLists.txt index adedd049023daa053aebcff60dd245408f4901bd..af87e1b6cc61d190cf06b601f05455d8ac976d71 100644 --- a/paddle/fluid/distributed/test/CMakeLists.txt +++ b/paddle/fluid/distributed/test/CMakeLists.txt @@ -1,8 +1,10 @@ set_source_files_properties(table_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test(table_test SRCS table_test.cc DEPS common_table table tensor_accessor ps_framework_proto ${COMMON_DEPS}) +cc_test(table_test SRCS table_test.cc DEPS common_table table tensor_accessor +ps_framework_proto ${COMMON_DEPS} ${RPC_DEPS}) set_source_files_properties(dense_table_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test(dense_table_test SRCS dense_table_test.cc DEPS common_table table tensor_accessor ps_framework_proto ${COMMON_DEPS}) +cc_test(dense_table_test SRCS dense_table_test.cc DEPS common_table table +tensor_accessor ps_framework_proto ${COMMON_DEPS} ${RPC_DEPS}) set_source_files_properties(barrier_table_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_test(barrier_table_test SRCS barrier_table_test.cc DEPS common_table table tensor_accessor ps_framework_proto ${COMMON_DEPS}) @@ -15,3 +17,6 @@ cc_test(brpc_service_sparse_sgd_test SRCS brpc_service_sparse_sgd_test.cc DEPS s set_source_files_properties(brpc_utils_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_test(brpc_utils_test SRCS brpc_utils_test.cc DEPS brpc_utils scope math_function ${COMMON_DEPS} ${RPC_DEPS}) + +set_source_files_properties(graph_node_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +cc_test(graph_node_test SRCS graph_node_test.cc DEPS graph_py_service scope server client communicator ps_service boost table ps_framework_proto ${COMMON_DEPS}) diff --git a/paddle/fluid/distributed/test/brpc_service_sparse_sgd_test.cc b/paddle/fluid/distributed/test/brpc_service_sparse_sgd_test.cc index fbd236012f523715451e9c21d3f2028f88d573f3..8fb3434af6e281762b762bbc8d01b372e5c0ee34 100644 --- a/paddle/fluid/distributed/test/brpc_service_sparse_sgd_test.cc +++ b/paddle/fluid/distributed/test/brpc_service_sparse_sgd_test.cc @@ -212,8 +212,8 @@ void RunBrpcPushSparse() { /*-----------------------Test Server Init----------------------------------*/ LOG(INFO) << "Run pull_sparse_param"; - auto pull_status = worker_ptr_->pull_sparse(fea_value_ptr.data(), 0, - fea_keys.data(), fea_keys.size()); + auto pull_status = worker_ptr_->pull_sparse( + fea_value_ptr.data(), 0, fea_keys.data(), fea_keys.size(), true); pull_status.wait(); for (size_t idx = 0; idx < tensor->numel(); ++idx) { fea_values.data()[idx] *= 2.0; @@ -241,7 +241,7 @@ void RunBrpcPushSparse() { push_status.wait(); auto pull_param_status = worker_ptr_->pull_sparse( - fea_temp_value_ptr.data(), 0, fea_keys.data(), fea_keys.size()); + fea_temp_value_ptr.data(), 0, fea_keys.data(), fea_keys.size(), true); pull_param_status.wait(); for (size_t idx = 0; idx < tensor->numel(); ++idx) { @@ -275,7 +275,7 @@ void RunBrpcPushSparse() { push_grad_status.wait(); auto pull_update_status = worker_ptr_->pull_sparse( - fea_temp_value_ptr.data(), 0, fea_keys.data(), fea_keys.size()); + fea_temp_value_ptr.data(), 0, fea_keys.data(), 
fea_keys.size(), true); pull_update_status.wait(); for (size_t idx = 0; idx < tensor->numel(); ++idx) { diff --git a/paddle/fluid/distributed/test/geo_table_test.cc b/paddle/fluid/distributed/test/geo_table_test.cc index 22e11acf6584eefa3e41ccd950feb2dfb4bf3720..c9f15db3f788e13ca2f9a8279358358f1c50131b 100644 --- a/paddle/fluid/distributed/test/geo_table_test.cc +++ b/paddle/fluid/distributed/test/geo_table_test.cc @@ -23,6 +23,7 @@ limitations under the License. */ #include "paddle/fluid/distributed/ps.pb.h" #include "paddle/fluid/distributed/table/common_dense_table.h" #include "paddle/fluid/distributed/table/common_sparse_table.h" +#include "paddle/fluid/distributed/table/depends/sparse_utils.h" #include "paddle/fluid/distributed/table/sparse_geo_table.h" #include "paddle/fluid/distributed/table/table.h" @@ -53,14 +54,18 @@ TEST(SparseGeoTable, SSUM) { // test push_sparse_param, and create params std::vector init_keys = {0, 1, 2, 3, 4}; + std::vector init_fres = {1, 1, 1, 1, 1}; std::vector init_values; for (size_t i = 0; i < init_keys.size() * emb_dim; i++) { init_values.push_back(0.0); } table->push_sparse_param(init_keys.data(), init_values.data(), init_keys.size()); + std::vector pull_values(init_values.size()); - table->pull_sparse(pull_values.data(), init_keys.data(), init_keys.size()); + auto value = PullSparseValue(init_keys, init_fres, emb_dim); + table->pull_sparse(pull_values.data(), value); + for (size_t i = 0; i < init_keys.size() * emb_dim; i++) { ASSERT_TRUE(abs(pull_values[i] - init_values[i]) < 1e-5); } diff --git a/paddle/fluid/distributed/test/graph_node_test.cc b/paddle/fluid/distributed/test/graph_node_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..b268bb449e14619048e89c8933dbae7daf66537b --- /dev/null +++ b/paddle/fluid/distributed/test/graph_node_test.cc @@ -0,0 +1,556 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include // NOLINT +#include +#include +#include +#include // NOLINT +#include +#include +#include "google/protobuf/text_format.h" + +#include "gtest/gtest.h" +#include "paddle/fluid/distributed/ps.pb.h" +#include "paddle/fluid/distributed/service/brpc_ps_client.h" +#include "paddle/fluid/distributed/service/brpc_ps_server.h" +#include "paddle/fluid/distributed/service/env.h" +#include "paddle/fluid/distributed/service/graph_brpc_client.h" +#include "paddle/fluid/distributed/service/graph_brpc_server.h" +#include "paddle/fluid/distributed/service/graph_py_service.h" +#include "paddle/fluid/distributed/service/ps_client.h" +#include "paddle/fluid/distributed/service/sendrecv.pb.h" +#include "paddle/fluid/distributed/service/service.h" +#include "paddle/fluid/distributed/table/graph/graph_node.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/string/printf.h" + +namespace framework = paddle::framework; +namespace platform = paddle::platform; +namespace operators = paddle::operators; +namespace math = paddle::operators::math; +namespace memory = paddle::memory; +namespace distributed = paddle::distributed; + +void testSampleNodes( + std::shared_ptr& worker_ptr_) { + std::vector ids; + auto pull_status = worker_ptr_->random_sample_nodes(0, 0, 6, ids); + std::unordered_set s; + std::unordered_set s1 = {37, 59}; + pull_status.wait(); + for (auto id : ids) s.insert(id); + ASSERT_EQ(true, s.size() == s1.size()); + for (auto id : s) { + ASSERT_EQ(true, s1.find(id) != s1.end()); + } +} + +void testFeatureNodeSerializeInt() { + std::string out = + distributed::FeatureNode::parse_value_to_bytes({"123", "345"}); + std::vector out2 = + distributed::FeatureNode::parse_bytes_to_array(out); + ASSERT_EQ(out2[0], 123); + ASSERT_EQ(out2[1], 345); +} + +void testFeatureNodeSerializeInt64() { + std::string out = + distributed::FeatureNode::parse_value_to_bytes({"123", "345"}); + std::vector out2 = + distributed::FeatureNode::parse_bytes_to_array(out); + ASSERT_EQ(out2[0], 123); + ASSERT_EQ(out2[1], 345); +} + +void testFeatureNodeSerializeFloat32() { + std::string out = distributed::FeatureNode::parse_value_to_bytes( + {"123.123", "345.123"}); + std::vector out2 = + distributed::FeatureNode::parse_bytes_to_array(out); + float eps; + std::cout << "Float " << out2[0] << " " << 123.123 << std::endl; + eps = out2[0] - 123.123; + ASSERT_LE(eps * eps, 1e-5); + eps = out2[1] - 345.123; + ASSERT_LE(eps * eps, 1e-5); +} + +void testFeatureNodeSerializeFloat64() { + std::string out = distributed::FeatureNode::parse_value_to_bytes( + {"123.123", "345.123"}); + std::vector out2 = + distributed::FeatureNode::parse_bytes_to_array(out); + float eps; + eps = out2[0] - 123.123; + std::cout << "Float64 " << out2[0] << " " << 123.123 << std::endl; + ASSERT_LE(eps * eps, 1e-5); + eps = out2[1] - 345.123; + ASSERT_LE(eps * eps, 1e-5); +} + +void testSingleSampleNeighboor( + std::shared_ptr& worker_ptr_) { + std::vector>> vs; + auto pull_status = worker_ptr_->batch_sample_neighboors( + 0, std::vector(1, 37), 4, vs); + pull_status.wait(); + + std::unordered_set s; + std::unordered_set s1 = {112, 45, 145}; + for (auto g : vs[0]) { + s.insert(g.first); + } + ASSERT_EQ(s.size(), 3); + for (auto g : s) { + 
ASSERT_EQ(true, s1.find(g) != s1.end()); + } + VLOG(0) << "test single done"; + s.clear(); + s1.clear(); + vs.clear(); + pull_status = worker_ptr_->batch_sample_neighboors( + 0, std::vector(1, 96), 4, vs); + pull_status.wait(); + s1 = {111, 48, 247}; + for (auto g : vs[0]) { + s.insert(g.first); + } + ASSERT_EQ(s.size(), 3); + for (auto g : s) { + ASSERT_EQ(true, s1.find(g) != s1.end()); + } +} + +void testBatchSampleNeighboor( + std::shared_ptr& worker_ptr_) { + std::vector>> vs; + std::vector v = {37, 96}; + auto pull_status = worker_ptr_->batch_sample_neighboors(0, v, 4, vs); + pull_status.wait(); + std::unordered_set s; + std::unordered_set s1 = {112, 45, 145}; + for (auto g : vs[0]) { + s.insert(g.first); + } + ASSERT_EQ(s.size(), 3); + for (auto g : s) { + ASSERT_EQ(true, s1.find(g) != s1.end()); + } + s.clear(); + s1.clear(); + s1 = {111, 48, 247}; + for (auto g : vs[1]) { + s.insert(g.first); + } + ASSERT_EQ(s.size(), 3); + for (auto g : s) { + ASSERT_EQ(true, s1.find(g) != s1.end()); + } +} + +void testGraphToBuffer(); +// std::string nodes[] = {std::string("37\taa\t45;0.34\t145;0.31\t112;0.21"), +// std::string("96\tfeature\t48;1.4\t247;0.31\t111;1.21"), +// std::string("59\ttreat\t45;0.34\t145;0.31\t112;0.21"), +// std::string("97\tfood\t48;1.4\t247;0.31\t111;1.21")}; + +std::string edges[] = { + std::string("37\t45\t0.34"), std::string("37\t145\t0.31"), + std::string("37\t112\t0.21"), std::string("96\t48\t1.4"), + std::string("96\t247\t0.31"), std::string("96\t111\t1.21"), + std::string("59\t45\t0.34"), std::string("59\t145\t0.31"), + std::string("59\t122\t0.21"), std::string("97\t48\t0.34"), + std::string("97\t247\t0.31"), std::string("97\t111\t0.21")}; +char edge_file_name[] = "edges.txt"; + +std::string nodes[] = { + std::string("user\t37\ta 0.34\tb 13 14\tc hello\td abc"), + std::string("user\t96\ta 0.31\tb 15 10\tc 96hello\td abcd"), + std::string("user\t59\ta 0.11\tb 11 14"), + std::string("user\t97\ta 0.11\tb 12 11"), + std::string("item\t45\ta 0.21"), + std::string("item\t145\ta 0.21"), + std::string("item\t112\ta 0.21"), + std::string("item\t48\ta 0.21"), + std::string("item\t247\ta 0.21"), + std::string("item\t111\ta 0.21"), + std::string("item\t46\ta 0.21"), + std::string("item\t146\ta 0.21"), + std::string("item\t122\ta 0.21"), + std::string("item\t49\ta 0.21"), + std::string("item\t248\ta 0.21"), + std::string("item\t113\ta 0.21")}; +char node_file_name[] = "nodes.txt"; + +void prepare_file(char file_name[], bool load_edge) { + std::ofstream ofile; + ofile.open(file_name); + if (load_edge) { + for (auto x : edges) { + ofile << x << std::endl; + } + } else { + for (auto x : nodes) { + ofile << x << std::endl; + } + } + ofile.close(); +} +void GetDownpourSparseTableProto( + ::paddle::distributed::TableParameter* sparse_table_proto) { + sparse_table_proto->set_table_id(0); + sparse_table_proto->set_table_class("GraphTable"); + sparse_table_proto->set_shard_num(127); + sparse_table_proto->set_type(::paddle::distributed::PS_SPARSE_TABLE); + ::paddle::distributed::TableAccessorParameter* accessor_proto = + sparse_table_proto->mutable_accessor(); + accessor_proto->set_accessor_class("CommMergeAccessor"); +} + +::paddle::distributed::PSParameter GetServerProto() { + // Generate server proto desc + ::paddle::distributed::PSParameter server_fleet_desc; + ::paddle::distributed::ServerParameter* server_proto = + server_fleet_desc.mutable_server_param(); + ::paddle::distributed::DownpourServerParameter* downpour_server_proto = + server_proto->mutable_downpour_server_param(); + 
::paddle::distributed::ServerServiceParameter* server_service_proto = + downpour_server_proto->mutable_service_param(); + server_service_proto->set_service_class("GraphBrpcService"); + server_service_proto->set_server_class("GraphBrpcServer"); + server_service_proto->set_client_class("GraphBrpcClient"); + server_service_proto->set_start_server_port(0); + server_service_proto->set_server_thread_num(12); + + ::paddle::distributed::TableParameter* sparse_table_proto = + downpour_server_proto->add_downpour_table_param(); + GetDownpourSparseTableProto(sparse_table_proto); + return server_fleet_desc; +} + +::paddle::distributed::PSParameter GetWorkerProto() { + ::paddle::distributed::PSParameter worker_fleet_desc; + ::paddle::distributed::WorkerParameter* worker_proto = + worker_fleet_desc.mutable_worker_param(); + + ::paddle::distributed::DownpourWorkerParameter* downpour_worker_proto = + worker_proto->mutable_downpour_worker_param(); + + ::paddle::distributed::TableParameter* worker_sparse_table_proto = + downpour_worker_proto->add_downpour_table_param(); + GetDownpourSparseTableProto(worker_sparse_table_proto); + + ::paddle::distributed::ServerParameter* server_proto = + worker_fleet_desc.mutable_server_param(); + ::paddle::distributed::DownpourServerParameter* downpour_server_proto = + server_proto->mutable_downpour_server_param(); + ::paddle::distributed::ServerServiceParameter* server_service_proto = + downpour_server_proto->mutable_service_param(); + server_service_proto->set_service_class("GraphBrpcService"); + server_service_proto->set_server_class("GraphBrpcServer"); + server_service_proto->set_client_class("GraphBrpcClient"); + server_service_proto->set_start_server_port(0); + server_service_proto->set_server_thread_num(12); + + ::paddle::distributed::TableParameter* server_sparse_table_proto = + downpour_server_proto->add_downpour_table_param(); + GetDownpourSparseTableProto(server_sparse_table_proto); + + return worker_fleet_desc; +} + +/*-------------------------------------------------------------------------*/ + +std::string ip_ = "127.0.0.1", ip2 = "127.0.0.1"; +uint32_t port_ = 5209, port2 = 5210; + +std::vector host_sign_list_; + +std::shared_ptr pserver_ptr_, + pserver_ptr2; + +std::shared_ptr worker_ptr_; + +void RunServer() { + LOG(INFO) << "init first server"; + ::paddle::distributed::PSParameter server_proto = GetServerProto(); + + auto _ps_env = paddle::distributed::PaddlePSEnvironment(); + _ps_env.set_ps_servers(&host_sign_list_, 2); // test + pserver_ptr_ = std::shared_ptr( + (paddle::distributed::GraphBrpcServer*) + paddle::distributed::PSServerFactory::create(server_proto)); + std::vector empty_vec; + framework::ProgramDesc empty_prog; + empty_vec.push_back(empty_prog); + pserver_ptr_->configure(server_proto, _ps_env, 0, empty_vec); + LOG(INFO) << "first server, run start(ip,port)"; + pserver_ptr_->start(ip_, port_); + LOG(INFO) << "init first server Done"; +} + +void RunServer2() { + LOG(INFO) << "init second server"; + ::paddle::distributed::PSParameter server_proto2 = GetServerProto(); + + auto _ps_env2 = paddle::distributed::PaddlePSEnvironment(); + _ps_env2.set_ps_servers(&host_sign_list_, 2); // test + pserver_ptr2 = std::shared_ptr( + (paddle::distributed::GraphBrpcServer*) + paddle::distributed::PSServerFactory::create(server_proto2)); + std::vector empty_vec2; + framework::ProgramDesc empty_prog2; + empty_vec2.push_back(empty_prog2); + pserver_ptr2->configure(server_proto2, _ps_env2, 1, empty_vec2); + pserver_ptr2->start(ip2, port2); +} + +void RunClient( + 
std::map>& dense_regions, + int index, paddle::distributed::PsBaseService* service) { + ::paddle::distributed::PSParameter worker_proto = GetWorkerProto(); + paddle::distributed::PaddlePSEnvironment _ps_env; + auto servers_ = host_sign_list_.size(); + _ps_env = paddle::distributed::PaddlePSEnvironment(); + _ps_env.set_ps_servers(&host_sign_list_, servers_); + worker_ptr_ = std::shared_ptr( + (paddle::distributed::GraphBrpcClient*) + paddle::distributed::PSClientFactory::create(worker_proto)); + worker_ptr_->configure(worker_proto, dense_regions, _ps_env, 0); + worker_ptr_->set_shard_num(127); + worker_ptr_->set_local_channel(index); + worker_ptr_->set_local_graph_service( + (paddle::distributed::GraphBrpcService*)service); +} + +void RunBrpcPushSparse() { + setenv("http_proxy", "", 1); + setenv("https_proxy", "", 1); + prepare_file(edge_file_name, 1); + prepare_file(node_file_name, 0); + auto ph_host = paddle::distributed::PSHost(ip_, port_, 0); + host_sign_list_.push_back(ph_host.serialize_to_string()); + + // test-start + auto ph_host2 = paddle::distributed::PSHost(ip2, port2, 1); + host_sign_list_.push_back(ph_host2.serialize_to_string()); + // test-end + // Srart Server + std::thread* server_thread = new std::thread(RunServer); + std::thread* server_thread2 = new std::thread(RunServer2); + sleep(1); + + std::map> dense_regions; + dense_regions.insert( + std::pair>(0, {})); + auto regions = dense_regions[0]; + + RunClient(dense_regions, 0, pserver_ptr_->get_service()); + + /*-----------------------Test Server Init----------------------------------*/ + auto pull_status = + worker_ptr_->load(0, std::string(edge_file_name), std::string("e>")); + srand(time(0)); + pull_status.wait(); + std::vector>> vs; + testSampleNodes(worker_ptr_); + sleep(5); + testSingleSampleNeighboor(worker_ptr_); + testBatchSampleNeighboor(worker_ptr_); + pull_status = worker_ptr_->batch_sample_neighboors( + 0, std::vector(1, 10240001024), 4, vs); + pull_status.wait(); + ASSERT_EQ(0, vs[0].size()); + + std::vector nodes; + pull_status = worker_ptr_->pull_graph_list(0, 0, 0, 1, 1, nodes); + pull_status.wait(); + ASSERT_EQ(nodes.size(), 1); + ASSERT_EQ(nodes[0].get_id(), 37); + nodes.clear(); + pull_status = worker_ptr_->pull_graph_list(0, 0, 1, 4, 1, nodes); + pull_status.wait(); + ASSERT_EQ(nodes.size(), 1); + ASSERT_EQ(nodes[0].get_id(), 59); + for (auto g : nodes) { + std::cout << g.get_id() << std::endl; + } + distributed::GraphPyServer server1, server2; + distributed::GraphPyClient client1, client2; + std::string ips_str = "127.0.0.1:5211;127.0.0.1:5212"; + std::vector edge_types = {std::string("user2item")}; + std::vector node_types = {std::string("user"), + std::string("item")}; + VLOG(0) << "make 2 servers"; + server1.set_up(ips_str, 127, node_types, edge_types, 0); + server2.set_up(ips_str, 127, node_types, edge_types, 1); + + server1.add_table_feat_conf("user", "a", "float32", 1); + server1.add_table_feat_conf("user", "b", "int32", 2); + server1.add_table_feat_conf("user", "c", "string", 1); + server1.add_table_feat_conf("user", "d", "string", 1); + server1.add_table_feat_conf("item", "a", "float32", 1); + + server2.add_table_feat_conf("user", "a", "float32", 1); + server2.add_table_feat_conf("user", "b", "int32", 2); + server2.add_table_feat_conf("user", "c", "string", 1); + server2.add_table_feat_conf("user", "d", "string", 1); + server2.add_table_feat_conf("item", "a", "float32", 1); + + client1.set_up(ips_str, 127, node_types, edge_types, 0); + + client1.add_table_feat_conf("user", "a", "float32", 1); + 
client1.add_table_feat_conf("user", "b", "int32", 2); + client1.add_table_feat_conf("user", "c", "string", 1); + client1.add_table_feat_conf("user", "d", "string", 1); + client1.add_table_feat_conf("item", "a", "float32", 1); + + client2.set_up(ips_str, 127, node_types, edge_types, 1); + + client2.add_table_feat_conf("user", "a", "float32", 1); + client2.add_table_feat_conf("user", "b", "int32", 2); + client2.add_table_feat_conf("user", "c", "string", 1); + client2.add_table_feat_conf("user", "d", "string", 1); + client2.add_table_feat_conf("item", "a", "float32", 1); + + server1.start_server(false); + std::cout << "first server done" << std::endl; + server2.start_server(false); + std::cout << "second server done" << std::endl; + client1.start_client(); + std::cout << "first client done" << std::endl; + client2.start_client(); + std::cout << "first client done" << std::endl; + std::cout << "started" << std::endl; + VLOG(0) << "come to set local server"; + client1.bind_local_server(0, server1); + VLOG(0) << "first bound"; + client2.bind_local_server(1, server2); + VLOG(0) << "second bound"; + client1.load_node_file(std::string("user"), std::string(node_file_name)); + client1.load_node_file(std::string("item"), std::string(node_file_name)); + client1.load_edge_file(std::string("user2item"), std::string(edge_file_name), + 0); + nodes.clear(); + + nodes = client1.pull_graph_list(std::string("user"), 0, 1, 4, 1); + + ASSERT_EQ(nodes[0].get_id(), 59); + nodes.clear(); + + // Test Pull by step + + std::unordered_set count_item_nodes; + // pull by step 2 + for (int test_step = 1; test_step < 4; test_step++) { + count_item_nodes.clear(); + std::cout << "check pull graph list by step " << test_step << std::endl; + for (int server_id = 0; server_id < 2; server_id++) { + for (int start_step = 0; start_step < test_step; start_step++) { + nodes = client1.pull_graph_list(std::string("item"), server_id, + start_step, 12, test_step); + for (auto g : nodes) { + count_item_nodes.insert(g.get_id()); + } + nodes.clear(); + } + } + ASSERT_EQ(count_item_nodes.size(), 12); + } + + vs = client1.batch_sample_neighboors(std::string("user2item"), + std::vector(1, 96), 4); + ASSERT_EQ(vs[0].size(), 3); + std::vector node_ids; + node_ids.push_back(96); + node_ids.push_back(37); + vs = client1.batch_sample_neighboors(std::string("user2item"), node_ids, 4); + + ASSERT_EQ(vs.size(), 2); + std::vector nodes_ids = client2.random_sample_nodes("user", 0, 6); + ASSERT_EQ(nodes_ids.size(), 2); + ASSERT_EQ(true, (nodes_ids[0] == 59 && nodes_ids[1] == 37) || + (nodes_ids[0] == 37 && nodes_ids[1] == 59)); + + // Test get node feat + node_ids.clear(); + node_ids.push_back(37); + node_ids.push_back(96); + std::vector feature_names; + feature_names.push_back(std::string("c")); + feature_names.push_back(std::string("d")); + auto node_feat = + client1.get_node_feat(std::string("user"), node_ids, feature_names); + ASSERT_EQ(node_feat.size(), 2); + ASSERT_EQ(node_feat[0].size(), 2); + VLOG(0) << "get_node_feat: " << node_feat[0][0]; + VLOG(0) << "get_node_feat: " << node_feat[0][1]; + VLOG(0) << "get_node_feat: " << node_feat[1][0]; + VLOG(0) << "get_node_feat: " << node_feat[1][1]; + + // Test string + node_ids.clear(); + node_ids.push_back(37); + node_ids.push_back(96); + // std::vector feature_names; + feature_names.clear(); + feature_names.push_back(std::string("a")); + feature_names.push_back(std::string("b")); + node_feat = + client1.get_node_feat(std::string("user"), node_ids, feature_names); + ASSERT_EQ(node_feat.size(), 2); + 
ASSERT_EQ(node_feat[0].size(), 2); + VLOG(0) << "get_node_feat: " << node_feat[0][0].size(); + VLOG(0) << "get_node_feat: " << node_feat[0][1].size(); + VLOG(0) << "get_node_feat: " << node_feat[1][0].size(); + VLOG(0) << "get_node_feat: " << node_feat[1][1].size(); + + std::remove(edge_file_name); + std::remove(node_file_name); + LOG(INFO) << "Run stop_server"; + worker_ptr_->stop_server(); + LOG(INFO) << "Run finalize_worker"; + worker_ptr_->finalize_worker(); + testFeatureNodeSerializeInt(); + testFeatureNodeSerializeInt64(); + testFeatureNodeSerializeFloat32(); + testFeatureNodeSerializeFloat64(); + testGraphToBuffer(); + client1.stop_server(); +} + +void testGraphToBuffer() { + ::paddle::distributed::GraphNode s, s1; + s.set_feature_size(1); + s.set_feature(0, std::string("hhhh")); + s.set_id(65); + int size = s.get_size(true); + char str[size]; + s.to_buffer(str, true); + s1.recover_from_buffer(str); + ASSERT_EQ(s.get_id(), s1.get_id()); + VLOG(0) << s.get_feature(0); + VLOG(0) << s1.get_feature(0); +} + +TEST(RunBrpcPushSparse, Run) { RunBrpcPushSparse(); } diff --git a/paddle/fluid/distributed/test/sparse_table_test.cc b/paddle/fluid/distributed/test/sparse_table_test.cc index 6db95c5fac211b94db726ee77c9122a8824c2351..26bede392d6fade06dd29cf5e5a28295bb1cbc43 100644 --- a/paddle/fluid/distributed/test/sparse_table_test.cc +++ b/paddle/fluid/distributed/test/sparse_table_test.cc @@ -55,9 +55,14 @@ TEST(CommonSparseTable, SGD) { // pull parameters for create and check std::vector init_keys = {0, 1, 2, 3, 4}; + std::vector init_fres = {1, 1, 1, 1, 1}; + std::vector init_values; init_values.resize(init_keys.size() * emb_dim); - table->pull_sparse(init_values.data(), init_keys.data(), init_keys.size()); + + std::vector pull_values(init_values.size()); + auto value = PullSparseValue(init_keys, init_fres, emb_dim); + table->pull_sparse(init_values.data(), value); // for check std::vector total_gradients; @@ -100,7 +105,8 @@ TEST(CommonSparseTable, SGD) { std::vector pull_values; pull_values.resize(init_keys.size() * emb_dim); - table->pull_sparse(pull_values.data(), init_keys.data(), init_keys.size()); + table->pull_sparse(init_values.data(), value); + for (size_t i = 0; i < init_values.size(); ++i) { auto update_val = init_values[i] - 1.0 * total_gradients[i]; ASSERT_TRUE(abs(update_val - pull_values[i]) < 1e-5); @@ -148,9 +154,13 @@ TEST(CommonSparseTable, Adam) { // pull parameters for create and check std::vector init_keys = {0, 1, 2, 3, 4}; + std::vector init_fres = {1, 1, 1, 1, 1}; + std::vector init_values; init_values.resize(init_keys.size() * emb_dim); - table->pull_sparse(init_values.data(), init_keys.data(), init_keys.size()); + + auto value = PullSparseValue(init_keys, init_fres, emb_dim); + table->pull_sparse(init_values.data(), value); // push gradient std::vector> trainer_keys; diff --git a/paddle/fluid/distributed/thirdparty/round_robin.h b/paddle/fluid/distributed/thirdparty/round_robin.h new file mode 100644 index 0000000000000000000000000000000000000000..f5075b4545af04b7179fcd3ba37bea3ecd58b6c8 --- /dev/null +++ b/paddle/fluid/distributed/thirdparty/round_robin.h @@ -0,0 +1,2685 @@ +// ______ _____ ______ _________ +// ______________ ___ /_ ___(_)_______ ___ /_ ______ ______ ______ / +// __ ___/_ __ \__ __ \__ / __ __ \ __ __ \_ __ \_ __ \_ __ / +// _ / / /_/ /_ /_/ /_ / _ / / / _ / / // /_/ // /_/ // /_/ / +// /_/ \____/ /_.___/ /_/ /_/ /_/ ________/_/ /_/ \____/ \____/ \__,_/ +// _/_____/ +// +// Fast & memory efficient hashtable based on robin hood hashing for +// 
C++11/14/17/20 +// https://github.com/martinus/robin-hood-hashing +// +// Licensed under the MIT License . +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2021 Martin Ankerl +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#ifndef ROBIN_HOOD_H_INCLUDED +#define ROBIN_HOOD_H_INCLUDED + +// see https://semver.org/ +#define ROBIN_HOOD_VERSION_MAJOR 3 // for incompatible API changes +#define ROBIN_HOOD_VERSION_MINOR \ + 11 // for adding functionality in a backwards-compatible manner +#define ROBIN_HOOD_VERSION_PATCH 1 // for backwards-compatible bug fixes + +#include +#include +#include +#include +#include // only to support hash of smart pointers +#include +#include +#include +#include +#if __cplusplus >= 201703L +#include +#endif + +// #define ROBIN_HOOD_LOG_ENABLED +#ifdef ROBIN_HOOD_LOG_ENABLED +#include +#define ROBIN_HOOD_LOG(...) \ + std::cout << __FUNCTION__ << "@" << __LINE__ << ": " << __VA_ARGS__ \ + << std::endl; +#else +#define ROBIN_HOOD_LOG(x) +#endif + +// #define ROBIN_HOOD_TRACE_ENABLED +#ifdef ROBIN_HOOD_TRACE_ENABLED +#include +#define ROBIN_HOOD_TRACE(...) \ + std::cout << __FUNCTION__ << "@" << __LINE__ << ": " << __VA_ARGS__ \ + << std::endl; +#else +#define ROBIN_HOOD_TRACE(x) +#endif + +// #define ROBIN_HOOD_COUNT_ENABLED +#ifdef ROBIN_HOOD_COUNT_ENABLED +#include +#define ROBIN_HOOD_COUNT(x) ++counts().x; +namespace robin_hood { +struct Counts { + uint64_t shiftUp{}; + uint64_t shiftDown{}; +}; +inline std::ostream &operator<<(std::ostream &os, Counts const &c) { + return os << c.shiftUp << " shiftUp" << std::endl + << c.shiftDown << " shiftDown" << std::endl; +} + +static Counts &counts() { + static Counts counts{}; + return counts; +} +} // namespace robin_hood +#else +#define ROBIN_HOOD_COUNT(x) +#endif + +// all non-argument macros should use this facility. 
See +// https://www.fluentcpp.com/2019/05/28/better-macros-better-flags/ +#define ROBIN_HOOD(x) ROBIN_HOOD_PRIVATE_DEFINITION_##x() + +// mark unused members with this macro +#define ROBIN_HOOD_UNUSED(identifier) + +// bitness +#if SIZE_MAX == UINT32_MAX +#define ROBIN_HOOD_PRIVATE_DEFINITION_BITNESS() 32 +#elif SIZE_MAX == UINT64_MAX +#define ROBIN_HOOD_PRIVATE_DEFINITION_BITNESS() 64 +#else +#error Unsupported bitness +#endif + +// endianess +#ifdef _MSC_VER +#define ROBIN_HOOD_PRIVATE_DEFINITION_LITTLE_ENDIAN() 1 +#define ROBIN_HOOD_PRIVATE_DEFINITION_BIG_ENDIAN() 0 +#else +#define ROBIN_HOOD_PRIVATE_DEFINITION_LITTLE_ENDIAN() \ + (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) +#define ROBIN_HOOD_PRIVATE_DEFINITION_BIG_ENDIAN() \ + (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) +#endif + +// inline +#ifdef _MSC_VER +#define ROBIN_HOOD_PRIVATE_DEFINITION_NOINLINE() __declspec(noinline) +#else +#define ROBIN_HOOD_PRIVATE_DEFINITION_NOINLINE() __attribute__((noinline)) +#endif + +// exceptions +#if !defined(__cpp_exceptions) && !defined(__EXCEPTIONS) && !defined(_CPPUNWIND) +#define ROBIN_HOOD_PRIVATE_DEFINITION_HAS_EXCEPTIONS() 0 +#else +#define ROBIN_HOOD_PRIVATE_DEFINITION_HAS_EXCEPTIONS() 1 +#endif + +// count leading/trailing bits +#if !defined(ROBIN_HOOD_DISABLE_INTRINSICS) +#ifdef _MSC_VER +#if ROBIN_HOOD(BITNESS) == 32 +#define ROBIN_HOOD_PRIVATE_DEFINITION_BITSCANFORWARD() _BitScanForward +#else +#define ROBIN_HOOD_PRIVATE_DEFINITION_BITSCANFORWARD() _BitScanForward64 +#endif +#include +#pragma intrinsic(ROBIN_HOOD(BITSCANFORWARD)) +#define ROBIN_HOOD_COUNT_TRAILING_ZEROES(x) \ + [](size_t mask) noexcept->int { \ + unsigned long index; \ + return ROBIN_HOOD(BITSCANFORWARD)(&index, mask) ? static_cast(index) \ + : ROBIN_HOOD(BITNESS); \ + } \ + (x) +#else +#if ROBIN_HOOD(BITNESS) == 32 +#define ROBIN_HOOD_PRIVATE_DEFINITION_CTZ() __builtin_ctzl +#define ROBIN_HOOD_PRIVATE_DEFINITION_CLZ() __builtin_clzl +#else +#define ROBIN_HOOD_PRIVATE_DEFINITION_CTZ() __builtin_ctzll +#define ROBIN_HOOD_PRIVATE_DEFINITION_CLZ() __builtin_clzll +#endif +#define ROBIN_HOOD_COUNT_LEADING_ZEROES(x) \ + ((x) ? ROBIN_HOOD(CLZ)(x) : ROBIN_HOOD(BITNESS)) +#define ROBIN_HOOD_COUNT_TRAILING_ZEROES(x) \ + ((x) ? ROBIN_HOOD(CTZ)(x) : ROBIN_HOOD(BITNESS)) +#endif +#endif + +// fallthrough +#ifndef __has_cpp_attribute // For backwards compatibility +#define __has_cpp_attribute(x) 0 +#endif +#if __has_cpp_attribute(clang::fallthrough) +#define ROBIN_HOOD_PRIVATE_DEFINITION_FALLTHROUGH() [[clang::fallthrough]] +#elif __has_cpp_attribute(gnu::fallthrough) +#define ROBIN_HOOD_PRIVATE_DEFINITION_FALLTHROUGH() [[gnu::fallthrough]] +#else +#define ROBIN_HOOD_PRIVATE_DEFINITION_FALLTHROUGH() +#endif + +// likely/unlikely +#ifdef _MSC_VER +#define ROBIN_HOOD_LIKELY(condition) condition +#define ROBIN_HOOD_UNLIKELY(condition) condition +#else +#define ROBIN_HOOD_LIKELY(condition) __builtin_expect(condition, 1) +#define ROBIN_HOOD_UNLIKELY(condition) __builtin_expect(condition, 0) +#endif + +// detect if native wchar_t type is availiable in MSVC +#ifdef _MSC_VER +#ifdef _NATIVE_WCHAR_T_DEFINED +#define ROBIN_HOOD_PRIVATE_DEFINITION_HAS_NATIVE_WCHART() 1 +#else +#define ROBIN_HOOD_PRIVATE_DEFINITION_HAS_NATIVE_WCHART() 0 +#endif +#else +#define ROBIN_HOOD_PRIVATE_DEFINITION_HAS_NATIVE_WCHART() 1 +#endif + +// detect if MSVC supports the pair(std::piecewise_construct_t,...) 
consructor +// being constexpr +#ifdef _MSC_VER +#if _MSC_VER <= 1900 +#define ROBIN_HOOD_PRIVATE_DEFINITION_BROKEN_CONSTEXPR() 1 +#else +#define ROBIN_HOOD_PRIVATE_DEFINITION_BROKEN_CONSTEXPR() 0 +#endif +#else +#define ROBIN_HOOD_PRIVATE_DEFINITION_BROKEN_CONSTEXPR() 0 +#endif + +// workaround missing "is_trivially_copyable" in g++ < 5.0 +// See https://stackoverflow.com/a/31798726/48181 +#if defined(__GNUC__) && __GNUC__ < 5 +#define ROBIN_HOOD_IS_TRIVIALLY_COPYABLE(...) __has_trivial_copy(__VA_ARGS__) +#else +#define ROBIN_HOOD_IS_TRIVIALLY_COPYABLE(...) \ + std::is_trivially_copyable<__VA_ARGS__>::value +#endif + +// helpers for C++ versions, see +// https://gcc.gnu.org/onlinedocs/cpp/Standard-Predefined-Macros.html +#define ROBIN_HOOD_PRIVATE_DEFINITION_CXX() __cplusplus +#define ROBIN_HOOD_PRIVATE_DEFINITION_CXX98() 199711L +#define ROBIN_HOOD_PRIVATE_DEFINITION_CXX11() 201103L +#define ROBIN_HOOD_PRIVATE_DEFINITION_CXX14() 201402L +#define ROBIN_HOOD_PRIVATE_DEFINITION_CXX17() 201703L + +#if ROBIN_HOOD(CXX) >= ROBIN_HOOD(CXX17) +#define ROBIN_HOOD_PRIVATE_DEFINITION_NODISCARD() [[nodiscard]] +#else +#define ROBIN_HOOD_PRIVATE_DEFINITION_NODISCARD() +#endif + +namespace robin_hood { + +#if ROBIN_HOOD(CXX) >= ROBIN_HOOD(CXX14) +#define ROBIN_HOOD_STD std +#else + +// c++11 compatibility layer +namespace ROBIN_HOOD_STD { +template +struct alignment_of + : std::integral_constant< + std::size_t, alignof(typename std::remove_all_extents::type)> {}; + +template +class integer_sequence { + public: + using value_type = T; + static_assert(std::is_integral::value, "not integral type"); + static constexpr std::size_t size() noexcept { return sizeof...(Ints); } +}; +template +using index_sequence = integer_sequence; + +namespace detail_ { +template +struct IntSeqImpl { + using TValue = T; + static_assert(std::is_integral::value, "not integral type"); + static_assert(Begin >= 0 && Begin < End, + "unexpected argument (Begin<0 || Begin<=End)"); + + template + struct IntSeqCombiner; + + template + struct IntSeqCombiner, + integer_sequence> { + using TResult = integer_sequence; + }; + + using TResult = typename IntSeqCombiner< + typename IntSeqImpl::TResult, + typename IntSeqImpl::TResult>::TResult; +}; + +template +struct IntSeqImpl { + using TValue = T; + static_assert(std::is_integral::value, "not integral type"); + static_assert(Begin >= 0, "unexpected argument (Begin<0)"); + using TResult = integer_sequence; +}; + +template +struct IntSeqImpl { + using TValue = T; + static_assert(std::is_integral::value, "not integral type"); + static_assert(Begin >= 0, "unexpected argument (Begin<0)"); + using TResult = integer_sequence; +}; +} // namespace detail_ + +template +using make_integer_sequence = + typename detail_::IntSeqImpl::TResult; + +template +using make_index_sequence = make_integer_sequence; + +template +using index_sequence_for = make_index_sequence; + +} // namespace ROBIN_HOOD_STD + +#endif + +namespace detail { + +// make sure we static_cast to the correct type for hash_int +#if ROBIN_HOOD(BITNESS) == 64 +using SizeT = uint64_t; +#else +using SizeT = uint32_t; +#endif + +template +T rotr(T x, unsigned k) { + return (x >> k) | (x << (8U * sizeof(T) - k)); +} + +// This cast gets rid of warnings like "cast from 'uint8_t*' {aka 'unsigned +// char*'} to +// 'uint64_t*' {aka 'long unsigned int*'} increases required alignment of target +// type". Use with +// care! 
+template +inline T reinterpret_cast_no_cast_align_warning(void *ptr) noexcept { + return reinterpret_cast(ptr); +} + +template +inline T reinterpret_cast_no_cast_align_warning(void const *ptr) noexcept { + return reinterpret_cast(ptr); +} + +// make sure this is not inlined as it is slow and dramatically enlarges code, +// thus making other +// inlinings more difficult. Throws are also generally the slow path. +template +[[noreturn]] ROBIN_HOOD(NOINLINE) +#if ROBIN_HOOD(HAS_EXCEPTIONS) + void doThrow(Args &&... args) { + // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-array-to-pointer-decay) + throw E(std::forward(args)...); +} +#else + void doThrow(Args &&... ROBIN_HOOD_UNUSED(args) /*unused*/) { + abort(); +} +#endif + +template +T *assertNotNull(T *t, Args &&... args) { + if (ROBIN_HOOD_UNLIKELY(nullptr == t)) { + doThrow(std::forward(args)...); + } + return t; +} + +template +inline T unaligned_load(void const *ptr) noexcept { + // using memcpy so we don't get into unaligned load problems. + // compiler should optimize this very well anyways. + T t; + std::memcpy(&t, ptr, sizeof(T)); + return t; +} + +// Allocates bulks of memory for objects of type T. This deallocates the memory +// in the destructor, +// and keeps a linked list of the allocated memory around. Overhead per +// allocation is the size of a +// pointer. +template +class BulkPoolAllocator { + public: + BulkPoolAllocator() noexcept = default; + + // does not copy anything, just creates a new allocator. + BulkPoolAllocator(const BulkPoolAllocator &ROBIN_HOOD_UNUSED( + o) /*unused*/) noexcept : mHead(nullptr), + mListForFree(nullptr) {} + + BulkPoolAllocator(BulkPoolAllocator &&o) noexcept + : mHead(o.mHead), + mListForFree(o.mListForFree) { + o.mListForFree = nullptr; + o.mHead = nullptr; + } + + BulkPoolAllocator &operator=(BulkPoolAllocator &&o) noexcept { + reset(); + mHead = o.mHead; + mListForFree = o.mListForFree; + o.mListForFree = nullptr; + o.mHead = nullptr; + return *this; + } + + BulkPoolAllocator & + // NOLINTNEXTLINE(bugprone-unhandled-self-assignment,cert-oop54-cpp) + operator=(const BulkPoolAllocator &ROBIN_HOOD_UNUSED(o) /*unused*/) noexcept { + // does not do anything + return *this; + } + + ~BulkPoolAllocator() noexcept { reset(); } + + // Deallocates all allocated memory. + void reset() noexcept { + while (mListForFree) { + T *tmp = *mListForFree; + ROBIN_HOOD_LOG("std::free") + std::free(mListForFree); + mListForFree = reinterpret_cast_no_cast_align_warning(tmp); + } + mHead = nullptr; + } + + // allocates, but does NOT initialize. Use in-place new constructor, e.g. + // T* obj = pool.allocate(); + // ::new (static_cast(obj)) T(); + T *allocate() { + T *tmp = mHead; + if (!tmp) { + tmp = performAllocation(); + } + + mHead = *reinterpret_cast_no_cast_align_warning(tmp); + return tmp; + } + + // does not actually deallocate but puts it in store. + // make sure you have already called the destructor! e.g. with + // obj->~T(); + // pool.deallocate(obj); + void deallocate(T *obj) noexcept { + *reinterpret_cast_no_cast_align_warning(obj) = mHead; + mHead = obj; + } + + // Adds an already allocated block of memory to the allocator. This allocator + // is from now on + // responsible for freeing the data (with free()). If the provided data is not + // large enough to + // make use of, it is immediately freed. Otherwise it is reused and freed in + // the destructor. 
+ void addOrFree(void *ptr, const size_t numBytes) noexcept { + // calculate number of available elements in ptr + if (numBytes < ALIGNMENT + ALIGNED_SIZE) { + // not enough data for at least one element. Free and return. + ROBIN_HOOD_LOG("std::free") + std::free(ptr); + } else { + ROBIN_HOOD_LOG("add to buffer") + add(ptr, numBytes); + } + } + + void swap(BulkPoolAllocator &other) noexcept { + using std::swap; + swap(mHead, other.mHead); + swap(mListForFree, other.mListForFree); + } + + private: + // iterates the list of allocated memory to calculate how many to alloc next. + // Recalculating this each time saves us a size_t member. + // This ignores the fact that memory blocks might have been added manually + // with addOrFree. In + // practice, this should not matter much. + ROBIN_HOOD(NODISCARD) size_t calcNumElementsToAlloc() const noexcept { + auto tmp = mListForFree; + size_t numAllocs = MinNumAllocs; + + while (numAllocs * 2 <= MaxNumAllocs && tmp) { + auto x = reinterpret_cast(tmp); + tmp = *x; + numAllocs *= 2; + } + + return numAllocs; + } + + // WARNING: Underflow if numBytes < ALIGNMENT! This is guarded in addOrFree(). + void add(void *ptr, const size_t numBytes) noexcept { + const size_t numElements = (numBytes - ALIGNMENT) / ALIGNED_SIZE; + + auto data = reinterpret_cast(ptr); + + // link free list + auto x = reinterpret_cast(data); + *x = mListForFree; + mListForFree = data; + + // create linked list for newly allocated data + auto *const headT = reinterpret_cast_no_cast_align_warning( + reinterpret_cast(ptr) + ALIGNMENT); + + auto *const head = reinterpret_cast(headT); + + // Visual Studio compiler automatically unrolls this loop, which is pretty + // cool + for (size_t i = 0; i < numElements; ++i) { + *reinterpret_cast_no_cast_align_warning( + head + i * ALIGNED_SIZE) = head + (i + 1) * ALIGNED_SIZE; + } + + // last one points to 0 + *reinterpret_cast_no_cast_align_warning( + head + (numElements - 1) * ALIGNED_SIZE) = mHead; + mHead = headT; + } + + // Called when no memory is available (mHead == 0). + // Don't inline this slow path. + ROBIN_HOOD(NOINLINE) T *performAllocation() { + size_t const numElementsToAlloc = calcNumElementsToAlloc(); + + // alloc new memory: [prev |T, T, ... T] + size_t const bytes = ALIGNMENT + ALIGNED_SIZE * numElementsToAlloc; + ROBIN_HOOD_LOG("std::malloc " << bytes << " = " << ALIGNMENT << " + " + << ALIGNED_SIZE << " * " + << numElementsToAlloc) + add(assertNotNull(std::malloc(bytes)), bytes); + return mHead; + } + +// enforce byte alignment of the T's +#if ROBIN_HOOD(CXX) >= ROBIN_HOOD(CXX14) + static constexpr size_t ALIGNMENT = + (std::max)(std::alignment_of::value, std::alignment_of::value); +#else + static const size_t ALIGNMENT = + (ROBIN_HOOD_STD::alignment_of::value > + ROBIN_HOOD_STD::alignment_of::value) + ? ROBIN_HOOD_STD::alignment_of::value + : +ROBIN_HOOD_STD::alignment_of::value; // the + is for + // walkarround +#endif + + static constexpr size_t ALIGNED_SIZE = + ((sizeof(T) - 1) / ALIGNMENT + 1) * ALIGNMENT; + + static_assert(MinNumAllocs >= 1, "MinNumAllocs"); + static_assert(MaxNumAllocs >= MinNumAllocs, "MaxNumAllocs"); + static_assert(ALIGNED_SIZE >= sizeof(T *), "ALIGNED_SIZE"); + static_assert(0 == (ALIGNED_SIZE % sizeof(T *)), "ALIGNED_SIZE mod"); + static_assert(ALIGNMENT >= sizeof(T *), "ALIGNMENT"); + + T *mHead{nullptr}; + T **mListForFree{nullptr}; +}; + +template +struct NodeAllocator; + +// dummy allocator that does nothing +template +struct NodeAllocator { + // we are not using the data, so just free it. 
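The ALIGNED_SIZE expression above rounds sizeof(T) up to the next multiple of ALIGNMENT so that pool slots are properly aligned and, per the static_asserts, large enough to double as free-list links. A worked instance of that rounding formula, with numbers chosen only for illustration:

// ((size - 1) / alignment + 1) * alignment rounds size up to a multiple of
// alignment, e.g. a 12-byte element with 8-byte alignment occupies 16 bytes.
static_assert(((12 - 1) / 8 + 1) * 8 == 16, "12 rounded up to 8-byte granularity");
static_assert(((16 - 1) / 8 + 1) * 8 == 16, "exact multiples are unchanged");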
+ void addOrFree(void *ptr, + size_t ROBIN_HOOD_UNUSED(numBytes) /*unused*/) noexcept { + ROBIN_HOOD_LOG("std::free") + std::free(ptr); + } +}; + +template +struct NodeAllocator + : public BulkPoolAllocator {}; + +// c++14 doesn't have is_nothrow_swappable, and clang++ 6.0.1 doesn't like it +// either, so I'm making +// my own here. +namespace swappable { +#if ROBIN_HOOD(CXX) < ROBIN_HOOD(CXX17) +using std::swap; +template +struct nothrow { + static const bool value = + noexcept(swap(std::declval(), std::declval())); +}; +#else +template +struct nothrow { + static const bool value = std::is_nothrow_swappable::value; +}; +#endif +} // namespace swappable + +} // namespace detail + +struct is_transparent_tag {}; + +// A custom pair implementation is used in the map because std::pair is not +// is_trivially_copyable, +// which means it would not be allowed to be used in std::memcpy. This struct +// is copyable, which is +// also tested. +template +struct pair { + using first_type = T1; + using second_type = T2; + + template ::value && + std::is_default_constructible::value>::type> + constexpr pair() noexcept(noexcept(U1()) && noexcept(U2())) + : first(), second() {} + + // pair constructors are explicit so we don't accidentally call this ctor when + // we don't have to. + explicit constexpr pair(std::pair const &o) noexcept( + noexcept(T1(std::declval())) && + noexcept(T2(std::declval()))) + : first(o.first), second(o.second) {} + + // pair constructors are explicit so we don't accidentally call this ctor when + // we don't have to. + explicit constexpr pair(std::pair &&o) noexcept( + noexcept(T1(std::move(std::declval()))) && + noexcept(T2(std::move(std::declval())))) + : first(std::move(o.first)), second(std::move(o.second)) {} + + constexpr pair(T1 &&a, T2 &&b) noexcept( + noexcept(T1(std::move(std::declval()))) && + noexcept(T2(std::move(std::declval())))) + : first(std::move(a)), second(std::move(b)) {} + + template + constexpr pair(U1 &&a, U2 &&b) noexcept( + noexcept(T1(std::forward(std::declval()))) && + noexcept(T2(std::forward(std::declval())))) + : first(std::forward(a)), second(std::forward(b)) {} + + template +// MSVC 2015 produces error "C2476: ‘constexpr’ constructor does not initialize +// all members" +// if this constructor is constexpr +#if !ROBIN_HOOD(BROKEN_CONSTEXPR) + constexpr +#endif + pair(std::piecewise_construct_t /*unused*/, std::tuple a, + std::tuple + b) noexcept(noexcept(pair(std::declval &>(), + std::declval &>(), + ROBIN_HOOD_STD::index_sequence_for< + U1...>(), + ROBIN_HOOD_STD::index_sequence_for< + U2...>()))) + : pair(a, b, ROBIN_HOOD_STD::index_sequence_for(), + ROBIN_HOOD_STD::index_sequence_for()) { + } + + // constructor called from the std::piecewise_construct_t ctor + template + pair( + std::tuple &a, std::tuple &b, + ROBIN_HOOD_STD::index_sequence /*unused*/, + ROBIN_HOOD_STD::index_sequence< + I2...> /*unused*/) noexcept(noexcept(T1(std:: + forward(std::get( + std::declval< + std::tuple + &>()))...)) && + noexcept(T2(std::forward(std::get( + std::declval< + std::tuple &>()))...))) + : first(std::forward(std::get(a))...), + second(std::forward(std::get(b))...) { + // make visual studio compiler happy about warning about unused a & b. + // Visual studio's pair implementation disables warning 4100. 
+ (void)a; + (void)b; + } + + void swap(pair &o) noexcept((detail::swappable::nothrow::value) && + (detail::swappable::nothrow::value)) { + using std::swap; + swap(first, o.first); + swap(second, o.second); + } + + T1 first; // NOLINT(misc-non-private-member-variables-in-classes) + T2 second; // NOLINT(misc-non-private-member-variables-in-classes) +}; + +template +inline void swap(pair &a, pair &b) noexcept( + noexcept(std::declval &>().swap(std::declval &>()))) { + a.swap(b); +} + +template +inline constexpr bool operator==(pair const &x, pair const &y) { + return (x.first == y.first) && (x.second == y.second); +} +template +inline constexpr bool operator!=(pair const &x, pair const &y) { + return !(x == y); +} +template +inline constexpr bool +operator<(pair const &x, pair const &y) noexcept( + noexcept(std::declval() < std::declval()) && + noexcept(std::declval() < std::declval())) { + return x.first < y.first || (!(y.first < x.first) && x.second < y.second); +} +template +inline constexpr bool operator>(pair const &x, pair const &y) { + return y < x; +} +template +inline constexpr bool operator<=(pair const &x, pair const &y) { + return !(x > y); +} +template +inline constexpr bool operator>=(pair const &x, pair const &y) { + return !(x < y); +} + +inline size_t hash_bytes(void const *ptr, size_t len) noexcept { + static constexpr uint64_t m = UINT64_C(0xc6a4a7935bd1e995); + static constexpr uint64_t seed = UINT64_C(0xe17a1465); + static constexpr unsigned int r = 47; + + auto const *const data64 = static_cast(ptr); + uint64_t h = seed ^ (len * m); + + size_t const n_blocks = len / 8; + for (size_t i = 0; i < n_blocks; ++i) { + auto k = detail::unaligned_load(data64 + i); + + k *= m; + k ^= k >> r; + k *= m; + + h ^= k; + h *= m; + } + + auto const *const data8 = + reinterpret_cast(data64 + n_blocks); + switch (len & 7U) { + case 7: + h ^= static_cast(data8[6]) << 48U; + ROBIN_HOOD(FALLTHROUGH); // FALLTHROUGH + case 6: + h ^= static_cast(data8[5]) << 40U; + ROBIN_HOOD(FALLTHROUGH); // FALLTHROUGH + case 5: + h ^= static_cast(data8[4]) << 32U; + ROBIN_HOOD(FALLTHROUGH); // FALLTHROUGH + case 4: + h ^= static_cast(data8[3]) << 24U; + ROBIN_HOOD(FALLTHROUGH); // FALLTHROUGH + case 3: + h ^= static_cast(data8[2]) << 16U; + ROBIN_HOOD(FALLTHROUGH); // FALLTHROUGH + case 2: + h ^= static_cast(data8[1]) << 8U; + ROBIN_HOOD(FALLTHROUGH); // FALLTHROUGH + case 1: + h ^= static_cast(data8[0]); + h *= m; + ROBIN_HOOD(FALLTHROUGH); // FALLTHROUGH + default: + break; + } + + h ^= h >> r; + + // not doing the final step here, because this will be done by keyToIdx + // anyways + // h *= m; + // h ^= h >> r; + return static_cast(h); +} + +inline size_t hash_int(uint64_t x) noexcept { + // tried lots of different hashes, let's stick with murmurhash3. It's simple, + // fast, well tested, + // and doesn't need any special 128bit operations. + x ^= x >> 33U; + x *= UINT64_C(0xff51afd7ed558ccd); + x ^= x >> 33U; + + // not doing the final step here, because this will be done by keyToIdx + // anyways + // x *= UINT64_C(0xc4ceb9fe1a85ec53); + // x ^= x >> 33U; + return static_cast(x); +} + +// A thin wrapper around std::hash, performing an additional simple mixing step +// of the result. 
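A usage sketch of the two hashing primitives defined above; the include path is an assumption, since the vendored header's location in the tree is not shown in this hunk.

#include <cstddef>
#include "robin_hood.h"  // assumed include path for this vendored header

inline std::size_t hash_example() {
  const char key[] = "paddle";
  // hash_bytes: murmurhash2-style mix over raw bytes (length excludes the NUL).
  std::size_t h1 = robin_hood::hash_bytes(key, sizeof(key) - 1);
  // hash_int: the murmurhash3 finalizer, for integer-sized keys.
  std::size_t h2 = robin_hood::hash_int(42U);
  return h1 ^ h2;
}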
+template +struct hash : public std::hash { + size_t operator()(T const &obj) const noexcept(noexcept( + std::declval>().operator()(std::declval()))) { + // call base hash + auto result = std::hash::operator()(obj); + // return mixed of that, to be save against identity has + return hash_int(static_cast(result)); + } +}; + +template +struct hash> { + size_t operator()(std::basic_string const &str) const noexcept { + return hash_bytes(str.data(), sizeof(CharT) * str.size()); + } +}; + +#if ROBIN_HOOD(CXX) >= ROBIN_HOOD(CXX17) +template +struct hash> { + size_t operator()(std::basic_string_view const &sv) const noexcept { + return hash_bytes(sv.data(), sizeof(CharT) * sv.size()); + } +}; +#endif + +template +struct hash { + size_t operator()(T *ptr) const noexcept { + return hash_int(reinterpret_cast(ptr)); + } +}; + +template +struct hash> { + size_t operator()(std::unique_ptr const &ptr) const noexcept { + return hash_int(reinterpret_cast(ptr.get())); + } +}; + +template +struct hash> { + size_t operator()(std::shared_ptr const &ptr) const noexcept { + return hash_int(reinterpret_cast(ptr.get())); + } +}; + +template +struct hash::value>::type> { + size_t operator()(Enum e) const noexcept { + using Underlying = typename std::underlying_type::type; + return hash{}(static_cast(e)); + } +}; + +#define ROBIN_HOOD_HASH_INT(T) \ + template <> \ + struct hash { \ + size_t operator()(T const &obj) const noexcept { \ + return hash_int(static_cast(obj)); \ + } \ + } + +#if defined(__GNUC__) && !defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wuseless-cast" +#endif +// see https://en.cppreference.com/w/cpp/utility/hash +ROBIN_HOOD_HASH_INT(bool); +ROBIN_HOOD_HASH_INT(char); +ROBIN_HOOD_HASH_INT(signed char); +ROBIN_HOOD_HASH_INT(unsigned char); +ROBIN_HOOD_HASH_INT(char16_t); +ROBIN_HOOD_HASH_INT(char32_t); +#if ROBIN_HOOD(HAS_NATIVE_WCHART) +ROBIN_HOOD_HASH_INT(wchar_t); +#endif +ROBIN_HOOD_HASH_INT(short); +ROBIN_HOOD_HASH_INT(unsigned short); +ROBIN_HOOD_HASH_INT(int); +ROBIN_HOOD_HASH_INT(unsigned int); +ROBIN_HOOD_HASH_INT(long); +ROBIN_HOOD_HASH_INT(long long); +ROBIN_HOOD_HASH_INT(unsigned long); +ROBIN_HOOD_HASH_INT(unsigned long long); +#if defined(__GNUC__) && !defined(__clang__) +#pragma GCC diagnostic pop +#endif +namespace detail { + +template +struct void_type { + using type = void; +}; + +template +struct has_is_transparent : public std::false_type {}; + +template +struct has_is_transparent::type> + : public std::true_type {}; + +// using wrapper classes for hash and key_equal prevents the diamond problem +// when the same type +// is used. see https://stackoverflow.com/a/28771920/48181 +template +struct WrapHash : public T { + WrapHash() = default; + explicit WrapHash(T const &o) noexcept(noexcept(T(std::declval()))) + : T(o) {} +}; + +template +struct WrapKeyEqual : public T { + WrapKeyEqual() = default; + explicit WrapKeyEqual(T const &o) noexcept( + noexcept(T(std::declval()))) + : T(o) {} +}; + +// A highly optimized hashmap implementation, using the Robin Hood algorithm. +// +// In most cases, this map should be usable as a drop-in replacement for +// std::unordered_map, but +// be about 2x faster in most cases and require much less allocations. +// +// This implementation uses the following memory layout: +// +// [Node, Node, ... Node | info, info, ... infoSentinel ] +// +// * Node: either a DataNode that directly has the std::pair as +// member, +// or a DataNode with a pointer to std::pair. 
Which DataNode +// representation to use +// depends on how fast the swap() operation is. Heuristically, this is +// automatically choosen +// based on sizeof(). there are always 2^n Nodes. +// +// * info: Each Node in the map has a corresponding info byte, so there are 2^n +// info bytes. +// Each byte is initialized to 0, meaning the corresponding Node is empty. Set +// to 1 means the +// corresponding node contains data. Set to 2 means the corresponding Node is +// filled, but it +// actually belongs to the previous position and was pushed out because that +// place is already +// taken. +// +// * infoSentinel: Sentinel byte set to 1, so that iterator's ++ can stop at +// end() without the +// need for a idx variable. +// +// According to STL, order of templates has effect on throughput. That's why +// I've moved the +// boolean to the front. +// https://www.reddit.com/r/cpp/comments/ahp6iu/compile_time_binary_size_reductions_and_cs_future/eeguck4/ +template +class Table + : public WrapHash, + public WrapKeyEqual, + detail::NodeAllocator< + typename std::conditional< + std::is_void::value, Key, + robin_hood::pair< + typename std::conditional::type, + T>>::type, + 4, 16384, IsFlat> { + public: + static constexpr bool is_flat = IsFlat; + static constexpr bool is_map = !std::is_void::value; + static constexpr bool is_set = !is_map; + static constexpr bool is_transparent = + has_is_transparent::value && has_is_transparent::value; + + using key_type = Key; + using mapped_type = T; + using value_type = typename std::conditional< + is_set, Key, + robin_hood::pair::type, + T>>::type; + using size_type = size_t; + using hasher = Hash; + using key_equal = KeyEqual; + using Self = + Table; + + private: + static_assert(MaxLoadFactor100 > 10 && MaxLoadFactor100 < 100, + "MaxLoadFactor100 needs to be >10 && < 100"); + + using WHash = WrapHash; + using WKeyEqual = WrapKeyEqual; + + // configuration defaults + + // make sure we have 8 elements, needed to quickly rehash mInfo + static constexpr size_t InitialNumElements = sizeof(uint64_t); + static constexpr uint32_t InitialInfoNumBits = 5; + static constexpr uint8_t InitialInfoInc = 1U << InitialInfoNumBits; + static constexpr size_t InfoMask = InitialInfoInc - 1U; + static constexpr uint8_t InitialInfoHashShift = 0; + using DataPool = detail::NodeAllocator; + + // type needs to be wider than uint8_t. + using InfoType = uint32_t; + + // DataNode //////////////////////////////////////////////////////// + + // Primary template for the data node. We have special implementations for + // small and big + // objects. For large objects it is assumed that swap() is fairly slow, so we + // allocate these + // on the heap so swap merely swaps a pointer. + template + class DataNode {}; + + // Small: just allocate on the stack. + template + class DataNode final { + public: + template + explicit DataNode( + M &ROBIN_HOOD_UNUSED(map) /*unused*/, + Args &&... args) noexcept(noexcept(value_type(std:: + forward( + args)...))) + : mData(std::forward(args)...) 
{} + + DataNode( + M &ROBIN_HOOD_UNUSED(map) /*unused*/, + DataNode + &&n) noexcept(std::is_nothrow_move_constructible::value) + : mData(std::move(n.mData)) {} + + // doesn't do anything + void destroy(M &ROBIN_HOOD_UNUSED(map) /*unused*/) noexcept {} + void destroyDoNotDeallocate() noexcept {} + + value_type const *operator->() const noexcept { return &mData; } + value_type *operator->() noexcept { return &mData; } + + const value_type &operator*() const noexcept { return mData; } + + value_type &operator*() noexcept { return mData; } + + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type + getFirst() noexcept { + return mData.first; + } + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type getFirst() noexcept { + return mData; + } + + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type + getFirst() const noexcept { + return mData.first; + } + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type getFirst() const + noexcept { + return mData; + } + + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type getSecond() noexcept { + return mData.second; + } + + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type getSecond() const + noexcept { + return mData.second; + } + + void swap(DataNode &o) noexcept( + noexcept(std::declval().swap(std::declval()))) { + mData.swap(o.mData); + } + + private: + value_type mData; + }; + + // big object: allocate on heap. + template + class DataNode { + public: + template + explicit DataNode(M &map, Args &&... args) : mData(map.allocate()) { + ::new (static_cast(mData)) + value_type(std::forward(args)...); + } + + DataNode(M &ROBIN_HOOD_UNUSED(map) /*unused*/, + DataNode &&n) noexcept : mData(std::move(n.mData)) {} + + void destroy(M &map) noexcept { + // don't deallocate, just put it into list of datapool. + mData->~value_type(); + map.deallocate(mData); + } + + void destroyDoNotDeallocate() noexcept { mData->~value_type(); } + + value_type const *operator->() const noexcept { return mData; } + + value_type *operator->() noexcept { return mData; } + + const value_type &operator*() const { return *mData; } + + value_type &operator*() { return *mData; } + + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type + getFirst() noexcept { + return mData->first; + } + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type getFirst() noexcept { + return *mData; + } + + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type + getFirst() const noexcept { + return mData->first; + } + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type getFirst() const + noexcept { + return *mData; + } + + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type getSecond() noexcept { + return mData->second; + } + + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type getSecond() const + noexcept { + return mData->second; + } + + void swap(DataNode &o) noexcept { + using std::swap; + swap(mData, o.mData); + } + + private: + value_type *mData; + }; + + using Node = DataNode; + + // helpers for insertKeyPrepareEmptySpot: extract first entry (only const + // required) + ROBIN_HOOD(NODISCARD) + key_type const &getFirstConst(Node const &n) const noexcept { + return n.getFirst(); + } + + // in case we have void mapped_type, we are not using a pair, thus we just + // route k through. + // No need to disable this because it's just not used if not applicable. 
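The two DataNode specializations above are what ultimately distinguish the containers defined near the end of this header: unordered_flat_map stores pairs in the table itself, unordered_node_map stores pool-allocated pairs behind a pointer, and unordered_map picks between them heuristically. A usage sketch, with the include path assumed as before:

#include <string>
#include "robin_hood.h"  // assumed include path

// Small, cheap-to-move values: the flat (in-table) representation is a good fit.
robin_hood::unordered_flat_map<int, int> id_to_count;

// Larger values, or callers that keep references across inserts and rehashes:
// the node representation keeps each pair at a stable heap address.
robin_hood::unordered_node_map<std::string, std::string> config;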
+ ROBIN_HOOD(NODISCARD) + key_type const &getFirstConst(key_type const &k) const noexcept { return k; } + + // in case we have non-void mapped_type, we have a standard robin_hood::pair + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::value, key_type const &>::type + getFirstConst(value_type const &vt) const noexcept { + return vt.first; + } + + // Cloner ////////////////////////////////////////////////////////// + + template + struct Cloner; + + // fast path: Just copy data, without allocating anything. + template + struct Cloner { + void operator()(M const &source, M &target) const { + auto const *const src = reinterpret_cast(source.mKeyVals); + auto *tgt = reinterpret_cast(target.mKeyVals); + auto const numElementsWithBuffer = + target.calcNumElementsWithBuffer(target.mMask + 1); + std::copy(src, src + target.calcNumBytesTotal(numElementsWithBuffer), + tgt); + } + }; + + template + struct Cloner { + void operator()(M const &s, M &t) const { + auto const numElementsWithBuffer = + t.calcNumElementsWithBuffer(t.mMask + 1); + std::copy(s.mInfo, s.mInfo + t.calcNumBytesInfo(numElementsWithBuffer), + t.mInfo); + + for (size_t i = 0; i < numElementsWithBuffer; ++i) { + if (t.mInfo[i]) { + ::new (static_cast(t.mKeyVals + i)) Node(t, *s.mKeyVals[i]); + } + } + } + }; + + // Destroyer /////////////////////////////////////////////////////// + + template + struct Destroyer {}; + + template + struct Destroyer { + void nodes(M &m) const noexcept { m.mNumElements = 0; } + + void nodesDoNotDeallocate(M &m) const noexcept { m.mNumElements = 0; } + }; + + template + struct Destroyer { + void nodes(M &m) const noexcept { + m.mNumElements = 0; + // clear also resets mInfo to 0, that's sometimes not necessary. + auto const numElementsWithBuffer = + m.calcNumElementsWithBuffer(m.mMask + 1); + + for (size_t idx = 0; idx < numElementsWithBuffer; ++idx) { + if (0 != m.mInfo[idx]) { + Node &n = m.mKeyVals[idx]; + n.destroy(m); + n.~Node(); + } + } + } + + void nodesDoNotDeallocate(M &m) const noexcept { + m.mNumElements = 0; + // clear also resets mInfo to 0, that's sometimes not necessary. + auto const numElementsWithBuffer = + m.calcNumElementsWithBuffer(m.mMask + 1); + for (size_t idx = 0; idx < numElementsWithBuffer; ++idx) { + if (0 != m.mInfo[idx]) { + Node &n = m.mKeyVals[idx]; + n.destroyDoNotDeallocate(); + n.~Node(); + } + } + } + }; + + // Iter //////////////////////////////////////////////////////////// + + struct fast_forward_tag {}; + + // generic iterator for both const_iterator and iterator. + template + // NOLINTNEXTLINE(hicpp-special-member-functions,cppcoreguidelines-special-member-functions) + class Iter { + private: + using NodePtr = + typename std::conditional::type; + + public: + using difference_type = std::ptrdiff_t; + using value_type = typename Self::value_type; + using reference = typename std::conditional::type; + using pointer = typename std::conditional::type; + using iterator_category = std::forward_iterator_tag; + + // default constructed iterator can be compared to itself, but WON'T return + // true when + // compared to end(). + Iter() = default; + + // Rule of zero: nothing specified. The conversion constructor is only + // enabled for + // iterator to const_iterator, so it doesn't accidentally work as a copy + // ctor. + + // Conversion constructor from iterator to const_iterator. 
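The memcpy fast path in Cloner above is only valid because robin_hood::pair, unlike std::pair as noted earlier in this header, stays trivially copyable for trivially copyable members. A minimal check of that property, using the macro defined near the top of the header (include path assumed as before):

#include "robin_hood.h"  // assumed include path

static_assert(ROBIN_HOOD_IS_TRIVIALLY_COPYABLE(robin_hood::pair<int, int>),
              "pairs of trivially copyable members can be cloned with memcpy");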
+ template ::type> + // NOLINTNEXTLINE(hicpp-explicit-conversions) + Iter(Iter const &other) noexcept : mKeyVals(other.mKeyVals), + mInfo(other.mInfo) {} + + Iter(NodePtr valPtr, uint8_t const *infoPtr) noexcept : mKeyVals(valPtr), + mInfo(infoPtr) {} + + Iter(NodePtr valPtr, uint8_t const *infoPtr, + fast_forward_tag ROBIN_HOOD_UNUSED(tag) /*unused*/) noexcept + : mKeyVals(valPtr), + mInfo(infoPtr) { + fastForward(); + } + + template ::type> + Iter &operator=(Iter const &other) noexcept { + mKeyVals = other.mKeyVals; + mInfo = other.mInfo; + return *this; + } + + // prefix increment. Undefined behavior if we are at end()! + Iter &operator++() noexcept { + mInfo++; + mKeyVals++; + fastForward(); + return *this; + } + + Iter operator++(int)noexcept { + Iter tmp = *this; + ++(*this); + return tmp; + } + + reference operator*() const { return **mKeyVals; } + + pointer operator->() const { return &**mKeyVals; } + + template + bool operator==(Iter const &o) const noexcept { + return mKeyVals == o.mKeyVals; + } + + template + bool operator!=(Iter const &o) const noexcept { + return mKeyVals != o.mKeyVals; + } + + private: + // fast forward to the next non-free info byte + // I've tried a few variants that don't depend on intrinsics, but + // unfortunately they are + // quite a bit slower than this one. So I've reverted that change again. See + // map_benchmark. + void fastForward() noexcept { + size_t n = 0; + while (0U == (n = detail::unaligned_load(mInfo))) { + mInfo += sizeof(size_t); + mKeyVals += sizeof(size_t); + } +#if defined(ROBIN_HOOD_DISABLE_INTRINSICS) + // we know for certain that within the next 8 bytes we'll find a non-zero + // one. + if (ROBIN_HOOD_UNLIKELY(0U == detail::unaligned_load(mInfo))) { + mInfo += 4; + mKeyVals += 4; + } + if (ROBIN_HOOD_UNLIKELY(0U == detail::unaligned_load(mInfo))) { + mInfo += 2; + mKeyVals += 2; + } + if (ROBIN_HOOD_UNLIKELY(0U == *mInfo)) { + mInfo += 1; + mKeyVals += 1; + } +#else +#if ROBIN_HOOD(LITTLE_ENDIAN) + auto inc = ROBIN_HOOD_COUNT_TRAILING_ZEROES(n) / 8; +#else + auto inc = ROBIN_HOOD_COUNT_LEADING_ZEROES(n) / 8; +#endif + mInfo += inc; + mKeyVals += inc; +#endif + } + + friend class Table; + NodePtr mKeyVals{nullptr}; + uint8_t const *mInfo{nullptr}; + }; + + //////////////////////////////////////////////////////////////////// + + // highly performance relevant code. + // Lower bits are used for indexing into the array (2^n size) + // The upper 1-5 bits need to be a reasonable good hash, to save comparisons. + template + void keyToIdx(HashKey &&key, size_t *idx, InfoType *info) const { + // In addition to whatever hash is used, add another mul & shift so we get + // better hashing. + // This serves as a bad hash prevention, if the given data is + // badly mixed. + auto h = static_cast(WHash::operator()(key)); + + h *= mHashMultiplier; + h ^= h >> 33U; + + // the lower InitialInfoNumBits are reserved for info. + *info = mInfoInc + static_cast((h & InfoMask) >> mInfoHashShift); + *idx = (static_cast(h) >> InitialInfoNumBits) & mMask; + } + + // forwards the index by one, wrapping around at the end + void next(InfoType *info, size_t *idx) const noexcept { + *idx = *idx + 1; + *info += mInfoInc; + } + + void nextWhileLess(InfoType *info, size_t *idx) const noexcept { + // unrolling this by hand did not bring any speedups. + while (*info < mInfo[*idx]) { + next(info, idx); + } + } + + // Shift everything up by one element. Tries to move stuff around. 
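keyToIdx above splits one mixed 64-bit hash into a bucket index and a small "info" value. The sketch below reproduces that split with the initial constants (InitialInfoNumBits == 5, so InfoMask == 31, the initial mInfoInc == 32 and mInfoHashShift == 0); the local names are illustrative, not the class members.

#include <cstddef>
#include <cstdint>

// Low 5 bits become extra "info" entropy on top of the base increment;
// the remaining bits select the bucket via the power-of-two mask.
inline void split_hash_example(std::uint64_t mixed, std::size_t mask,
                               std::uint32_t* info, std::size_t* idx) {
  const unsigned info_bits = 5;                             // InitialInfoNumBits
  const std::uint64_t info_mask = (1U << info_bits) - 1U;   // InfoMask == 31
  *info = 32U + static_cast<std::uint32_t>(mixed & info_mask);  // InitialInfoInc == 32
  *idx = static_cast<std::size_t>(mixed >> info_bits) & mask;
}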
+ void shiftUp(size_t startIdx, size_t const insertion_idx) noexcept( + std::is_nothrow_move_assignable::value) { + auto idx = startIdx; + ::new (static_cast(mKeyVals + idx)) + Node(std::move(mKeyVals[idx - 1])); + while (--idx != insertion_idx) { + mKeyVals[idx] = std::move(mKeyVals[idx - 1]); + } + + idx = startIdx; + while (idx != insertion_idx) { + ROBIN_HOOD_COUNT(shiftUp) + mInfo[idx] = static_cast(mInfo[idx - 1] + mInfoInc); + if (ROBIN_HOOD_UNLIKELY(mInfo[idx] + mInfoInc > 0xFF)) { + mMaxNumElementsAllowed = 0; + } + --idx; + } + } + + void shiftDown(size_t idx) noexcept( + std::is_nothrow_move_assignable::value) { + // until we find one that is either empty or has zero offset. + // TODO(martinus) we don't need to move everything, just the last one for + // the same + // bucket. + mKeyVals[idx].destroy(*this); + + // until we find one that is either empty or has zero offset. + while (mInfo[idx + 1] >= 2 * mInfoInc) { + ROBIN_HOOD_COUNT(shiftDown) + mInfo[idx] = static_cast(mInfo[idx + 1] - mInfoInc); + mKeyVals[idx] = std::move(mKeyVals[idx + 1]); + ++idx; + } + + mInfo[idx] = 0; + // don't destroy, we've moved it + // mKeyVals[idx].destroy(*this); + mKeyVals[idx].~Node(); + } + + // copy of find(), except that it returns iterator instead of const_iterator. + template + ROBIN_HOOD(NODISCARD) + size_t findIdx(Other const &key) const { + size_t idx{}; + InfoType info{}; + keyToIdx(key, &idx, &info); + + do { + // unrolling this twice gives a bit of a speedup. More unrolling did not + // help. + if (info == mInfo[idx] && ROBIN_HOOD_LIKELY(WKeyEqual::operator()( + key, mKeyVals[idx].getFirst()))) { + return idx; + } + next(&info, &idx); + if (info == mInfo[idx] && ROBIN_HOOD_LIKELY(WKeyEqual::operator()( + key, mKeyVals[idx].getFirst()))) { + return idx; + } + next(&info, &idx); + } while (info <= mInfo[idx]); + + // nothing found! + return mMask == 0 + ? 0 + : static_cast(std::distance( + mKeyVals, + reinterpret_cast_no_cast_align_warning(mInfo))); + } + + void cloneData(const Table &o) { + Cloner()(o, *this); + } + + // inserts a keyval that is guaranteed to be new, e.g. when the hashmap is + // resized. + // @return True on success, false if something went wrong + void insert_move(Node &&keyval) { + // we don't retry, fail if overflowing + // don't need to check max num elements + if (0 == mMaxNumElementsAllowed && !try_increase_info()) { + throwOverflowError(); + } + + size_t idx{}; + InfoType info{}; + keyToIdx(keyval.getFirst(), &idx, &info); + + // skip forward. Use <= because we are certain that the element is not + // there. + while (info <= mInfo[idx]) { + idx = idx + 1; + info += mInfoInc; + } + + // key not found, so we are now exactly where we want to insert it. + auto const insertion_idx = idx; + auto const insertion_info = static_cast(info); + if (ROBIN_HOOD_UNLIKELY(insertion_info + mInfoInc > 0xFF)) { + mMaxNumElementsAllowed = 0; + } + + // find an empty spot + while (0 != mInfo[idx]) { + next(&info, &idx); + } + + auto &l = mKeyVals[insertion_idx]; + if (idx == insertion_idx) { + ::new (static_cast(&l)) Node(std::move(keyval)); + } else { + shiftUp(idx, insertion_idx); + l = std::move(keyval); + } + + // put at empty spot + mInfo[insertion_idx] = insertion_info; + + ++mNumElements; + } + + public: + using iterator = Iter; + using const_iterator = Iter; + + Table() noexcept(noexcept(Hash()) && noexcept(KeyEqual())) + : WHash(), WKeyEqual() { + ROBIN_HOOD_TRACE(this) + } + + // Creates an empty hash map. Nothing is allocated yet, this happens at the + // first insert. 
+ // This tremendously speeds up ctor & dtor of a map that never receives an + // element. The + // penalty is payed at the first insert, and not before. Lookup of this empty + // map works + // because everybody points to DummyInfoByte::b. parameter bucket_count is + // dictated by the + // standard, but we can ignore it. + explicit Table( + size_t ROBIN_HOOD_UNUSED(bucket_count) /*unused*/, const Hash &h = Hash{}, + const KeyEqual &equal = KeyEqual{}) noexcept(noexcept(Hash(h)) && + noexcept(KeyEqual(equal))) + : WHash(h), WKeyEqual(equal) { + ROBIN_HOOD_TRACE(this) + } + + template + Table(Iter first, Iter last, + size_t ROBIN_HOOD_UNUSED(bucket_count) /*unused*/ = 0, + const Hash &h = Hash{}, const KeyEqual &equal = KeyEqual{}) + : WHash(h), WKeyEqual(equal) { + ROBIN_HOOD_TRACE(this) + insert(first, last); + } + + Table(std::initializer_list initlist, + size_t ROBIN_HOOD_UNUSED(bucket_count) /*unused*/ = 0, + const Hash &h = Hash{}, const KeyEqual &equal = KeyEqual{}) + : WHash(h), WKeyEqual(equal) { + ROBIN_HOOD_TRACE(this) + insert(initlist.begin(), initlist.end()); + } + + Table(Table &&o) noexcept : WHash(std::move(static_cast(o))), + WKeyEqual(std::move(static_cast(o))), + DataPool(std::move(static_cast(o))) { + ROBIN_HOOD_TRACE(this) + if (o.mMask) { + mHashMultiplier = std::move(o.mHashMultiplier); + mKeyVals = std::move(o.mKeyVals); + mInfo = std::move(o.mInfo); + mNumElements = std::move(o.mNumElements); + mMask = std::move(o.mMask); + mMaxNumElementsAllowed = std::move(o.mMaxNumElementsAllowed); + mInfoInc = std::move(o.mInfoInc); + mInfoHashShift = std::move(o.mInfoHashShift); + // set other's mask to 0 so its destructor won't do anything + o.init(); + } + } + + Table &operator=(Table &&o) noexcept { + ROBIN_HOOD_TRACE(this) + if (&o != this) { + if (o.mMask) { + // only move stuff if the other map actually has some data + destroy(); + mHashMultiplier = std::move(o.mHashMultiplier); + mKeyVals = std::move(o.mKeyVals); + mInfo = std::move(o.mInfo); + mNumElements = std::move(o.mNumElements); + mMask = std::move(o.mMask); + mMaxNumElementsAllowed = std::move(o.mMaxNumElementsAllowed); + mInfoInc = std::move(o.mInfoInc); + mInfoHashShift = std::move(o.mInfoHashShift); + WHash::operator=(std::move(static_cast(o))); + WKeyEqual::operator=(std::move(static_cast(o))); + DataPool::operator=(std::move(static_cast(o))); + + o.init(); + + } else { + // nothing in the other map => just clear us. + clear(); + } + } + return *this; + } + + Table(const Table &o) + : WHash(static_cast(o)), + WKeyEqual(static_cast(o)), + DataPool(static_cast(o)) { + ROBIN_HOOD_TRACE(this) + if (!o.empty()) { + // not empty: create an exact copy. it is also possible to just iterate + // through all + // elements and insert them, but copying is probably faster. + + auto const numElementsWithBuffer = calcNumElementsWithBuffer(o.mMask + 1); + auto const numBytesTotal = calcNumBytesTotal(numElementsWithBuffer); + + ROBIN_HOOD_LOG("std::malloc " << numBytesTotal << " = calcNumBytesTotal(" + << numElementsWithBuffer << ")") + mHashMultiplier = o.mHashMultiplier; + mKeyVals = static_cast( + detail::assertNotNull(std::malloc(numBytesTotal))); + // no need for calloc because clonData does memcpy + mInfo = reinterpret_cast(mKeyVals + numElementsWithBuffer); + mNumElements = o.mNumElements; + mMask = o.mMask; + mMaxNumElementsAllowed = o.mMaxNumElementsAllowed; + mInfoInc = o.mInfoInc; + mInfoHashShift = o.mInfoHashShift; + cloneData(o); + } + } + + // Creates a copy of the given map. 
Copy constructor of each entry is used. + // Not sure why clang-tidy thinks this doesn't handle self assignment, it does + // NOLINTNEXTLINE(bugprone-unhandled-self-assignment,cert-oop54-cpp) + Table &operator=(Table const &o) { + ROBIN_HOOD_TRACE(this) + if (&o == this) { + // prevent assigning of itself + return *this; + } + + // we keep using the old allocator and not assign the new one, because we + // want to keep + // the memory available. when it is the same size. + if (o.empty()) { + if (0 == mMask) { + // nothing to do, we are empty too + return *this; + } + + // not empty: destroy what we have there + // clear also resets mInfo to 0, that's sometimes not necessary. + destroy(); + init(); + WHash::operator=(static_cast(o)); + WKeyEqual::operator=(static_cast(o)); + DataPool::operator=(static_cast(o)); + + return *this; + } + + // clean up old stuff + Destroyer::value>{} + .nodes(*this); + + if (mMask != o.mMask) { + // no luck: we don't have the same array size allocated, so we need to + // realloc. + if (0 != mMask) { + // only deallocate if we actually have data! + ROBIN_HOOD_LOG("std::free") + std::free(mKeyVals); + } + + auto const numElementsWithBuffer = calcNumElementsWithBuffer(o.mMask + 1); + auto const numBytesTotal = calcNumBytesTotal(numElementsWithBuffer); + ROBIN_HOOD_LOG("std::malloc " << numBytesTotal << " = calcNumBytesTotal(" + << numElementsWithBuffer << ")") + mKeyVals = static_cast( + detail::assertNotNull(std::malloc(numBytesTotal))); + + // no need for calloc here because cloneData performs a memcpy. + mInfo = reinterpret_cast(mKeyVals + numElementsWithBuffer); + // sentinel is set in cloneData + } + WHash::operator=(static_cast(o)); + WKeyEqual::operator=(static_cast(o)); + DataPool::operator=(static_cast(o)); + mHashMultiplier = o.mHashMultiplier; + mNumElements = o.mNumElements; + mMask = o.mMask; + mMaxNumElementsAllowed = o.mMaxNumElementsAllowed; + mInfoInc = o.mInfoInc; + mInfoHashShift = o.mInfoHashShift; + cloneData(o); + + return *this; + } + + // Swaps everything between the two maps. + void swap(Table &o) { + ROBIN_HOOD_TRACE(this) + using std::swap; + swap(o, *this); + } + + // Clears all data, without resizing. + void clear() { + ROBIN_HOOD_TRACE(this) + if (empty()) { + // don't do anything! also important because we don't want to write to + // DummyInfoByte::b, even though we would just write 0 to it. + return; + } + + Destroyer::value>{} + .nodes(*this); + + auto const numElementsWithBuffer = calcNumElementsWithBuffer(mMask + 1); + // clear everything, then set the sentinel again + uint8_t const z = 0; + std::fill(mInfo, mInfo + calcNumBytesInfo(numElementsWithBuffer), z); + mInfo[numElementsWithBuffer] = 1; + + mInfoInc = InitialInfoInc; + mInfoHashShift = InitialInfoHashShift; + } + + // Destroys the map and all it's contents. + ~Table() { + ROBIN_HOOD_TRACE(this) + destroy(); + } + + // Checks if both tables contain the same entries. Order is irrelevant. 
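As the comment just above notes, operator== compares contents regardless of iteration order; a small usage sketch (include path assumed as before):

#include "robin_hood.h"  // assumed include path

inline bool equality_ignores_order() {
  robin_hood::unordered_map<int, int> a{{1, 10}, {2, 20}};
  robin_hood::unordered_map<int, int> b{{2, 20}, {1, 10}};
  return a == b;  // true: each entry of one table is looked up in the other
}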
+ bool operator==(const Table &other) const { + ROBIN_HOOD_TRACE(this) + if (other.size() != size()) { + return false; + } + for (auto const &otherEntry : other) { + if (!has(otherEntry)) { + return false; + } + } + + return true; + } + + bool operator!=(const Table &other) const { + ROBIN_HOOD_TRACE(this) + return !operator==(other); + } + + template + typename std::enable_if::value, Q &>::type operator[]( + const key_type &key) { + ROBIN_HOOD_TRACE(this) + auto idxAndState = insertKeyPrepareEmptySpot(key); + switch (idxAndState.second) { + case InsertionState::key_found: + break; + + case InsertionState::new_node: + ::new (static_cast(&mKeyVals[idxAndState.first])) + Node(*this, std::piecewise_construct, std::forward_as_tuple(key), + std::forward_as_tuple()); + break; + + case InsertionState::overwrite_node: + mKeyVals[idxAndState.first] = + Node(*this, std::piecewise_construct, std::forward_as_tuple(key), + std::forward_as_tuple()); + break; + + case InsertionState::overflow_error: + throwOverflowError(); + } + + return mKeyVals[idxAndState.first].getSecond(); + } + + template + typename std::enable_if::value, Q &>::type operator[]( + key_type &&key) { + ROBIN_HOOD_TRACE(this) + auto idxAndState = insertKeyPrepareEmptySpot(key); + switch (idxAndState.second) { + case InsertionState::key_found: + break; + + case InsertionState::new_node: + ::new (static_cast(&mKeyVals[idxAndState.first])) Node( + *this, std::piecewise_construct, + std::forward_as_tuple(std::move(key)), std::forward_as_tuple()); + break; + + case InsertionState::overwrite_node: + mKeyVals[idxAndState.first] = Node( + *this, std::piecewise_construct, + std::forward_as_tuple(std::move(key)), std::forward_as_tuple()); + break; + + case InsertionState::overflow_error: + throwOverflowError(); + } + + return mKeyVals[idxAndState.first].getSecond(); + } + + template + void insert(Iter first, Iter last) { + for (; first != last; ++first) { + // value_type ctor needed because this might be called with std::pair's + insert(value_type(*first)); + } + } + + void insert(std::initializer_list ilist) { + for (auto &&vt : ilist) { + insert(std::move(vt)); + } + } + + template + std::pair emplace(Args &&... args) { + ROBIN_HOOD_TRACE(this) + Node n{*this, std::forward(args)...}; + auto idxAndState = insertKeyPrepareEmptySpot(getFirstConst(n)); + switch (idxAndState.second) { + case InsertionState::key_found: + n.destroy(*this); + break; + + case InsertionState::new_node: + ::new (static_cast(&mKeyVals[idxAndState.first])) + Node(*this, std::move(n)); + break; + + case InsertionState::overwrite_node: + mKeyVals[idxAndState.first] = std::move(n); + break; + + case InsertionState::overflow_error: + n.destroy(*this); + throwOverflowError(); + break; + } + + return std::make_pair( + iterator(mKeyVals + idxAndState.first, mInfo + idxAndState.first), + InsertionState::key_found != idxAndState.second); + } + + template + std::pair try_emplace(const key_type &key, Args &&... args) { + return try_emplace_impl(key, std::forward(args)...); + } + + template + std::pair try_emplace(key_type &&key, Args &&... args) { + return try_emplace_impl(std::move(key), std::forward(args)...); + } + + template + std::pair try_emplace(const_iterator hint, + const key_type &key, Args &&... args) { + (void)hint; + return try_emplace_impl(key, std::forward(args)...); + } + + template + std::pair try_emplace(const_iterator hint, key_type &&key, + Args &&... 
args) { + (void)hint; + return try_emplace_impl(std::move(key), std::forward(args)...); + } + + template + std::pair insert_or_assign(const key_type &key, + Mapped &&obj) { + return insertOrAssignImpl(key, std::forward(obj)); + } + + template + std::pair insert_or_assign(key_type &&key, Mapped &&obj) { + return insertOrAssignImpl(std::move(key), std::forward(obj)); + } + + template + std::pair insert_or_assign(const_iterator hint, + const key_type &key, + Mapped &&obj) { + (void)hint; + return insertOrAssignImpl(key, std::forward(obj)); + } + + template + std::pair insert_or_assign(const_iterator hint, + key_type &&key, Mapped &&obj) { + (void)hint; + return insertOrAssignImpl(std::move(key), std::forward(obj)); + } + + std::pair insert(const value_type &keyval) { + ROBIN_HOOD_TRACE(this) + return emplace(keyval); + } + + std::pair insert(value_type &&keyval) { + return emplace(std::move(keyval)); + } + + // Returns 1 if key is found, 0 otherwise. + size_t count(const key_type &key) const { // NOLINT(modernize-use-nodiscard) + ROBIN_HOOD_TRACE(this) + auto kv = mKeyVals + findIdx(key); + if (kv != reinterpret_cast_no_cast_align_warning(mInfo)) { + return 1; + } + return 0; + } + + template + // NOLINTNEXTLINE(modernize-use-nodiscard) + typename std::enable_if::type count( + const OtherKey &key) const { + ROBIN_HOOD_TRACE(this) + auto kv = mKeyVals + findIdx(key); + if (kv != reinterpret_cast_no_cast_align_warning(mInfo)) { + return 1; + } + return 0; + } + + bool contains(const key_type &key) const { // NOLINT(modernize-use-nodiscard) + return 1U == count(key); + } + + template + // NOLINTNEXTLINE(modernize-use-nodiscard) + typename std::enable_if::type contains( + const OtherKey &key) const { + return 1U == count(key); + } + + // Returns a reference to the value found for key. + // Throws std::out_of_range if element cannot be found + template + // NOLINTNEXTLINE(modernize-use-nodiscard) + typename std::enable_if::value, Q &>::type at( + key_type const &key) { + ROBIN_HOOD_TRACE(this) + auto kv = mKeyVals + findIdx(key); + if (kv == reinterpret_cast_no_cast_align_warning(mInfo)) { + doThrow("key not found"); + } + return kv->getSecond(); + } + + // Returns a reference to the value found for key. 
+ // Throws std::out_of_range if element cannot be found + template + // NOLINTNEXTLINE(modernize-use-nodiscard) + typename std::enable_if::value, Q const &>::type at( + key_type const &key) const { + ROBIN_HOOD_TRACE(this) + auto kv = mKeyVals + findIdx(key); + if (kv == reinterpret_cast_no_cast_align_warning(mInfo)) { + doThrow("key not found"); + } + return kv->getSecond(); + } + + const_iterator find( + const key_type &key) const { // NOLINT(modernize-use-nodiscard) + ROBIN_HOOD_TRACE(this) + const size_t idx = findIdx(key); + return const_iterator{mKeyVals + idx, mInfo + idx}; + } + + template + const_iterator find(const OtherKey &key, + is_transparent_tag /*unused*/) const { + ROBIN_HOOD_TRACE(this) + const size_t idx = findIdx(key); + return const_iterator{mKeyVals + idx, mInfo + idx}; + } + + template + typename std::enable_if< + Self_::is_transparent, // NOLINT(modernize-use-nodiscard) + const_iterator>::type // NOLINT(modernize-use-nodiscard) + find(const OtherKey &key) const { // NOLINT(modernize-use-nodiscard) + ROBIN_HOOD_TRACE(this) + const size_t idx = findIdx(key); + return const_iterator{mKeyVals + idx, mInfo + idx}; + } + + iterator find(const key_type &key) { + ROBIN_HOOD_TRACE(this) + const size_t idx = findIdx(key); + return iterator{mKeyVals + idx, mInfo + idx}; + } + + template + iterator find(const OtherKey &key, is_transparent_tag /*unused*/) { + ROBIN_HOOD_TRACE(this) + const size_t idx = findIdx(key); + return iterator{mKeyVals + idx, mInfo + idx}; + } + + template + typename std::enable_if::type find( + const OtherKey &key) { + ROBIN_HOOD_TRACE(this) + const size_t idx = findIdx(key); + return iterator{mKeyVals + idx, mInfo + idx}; + } + + iterator begin() { + ROBIN_HOOD_TRACE(this) + if (empty()) { + return end(); + } + return iterator(mKeyVals, mInfo, fast_forward_tag{}); + } + const_iterator begin() const { // NOLINT(modernize-use-nodiscard) + ROBIN_HOOD_TRACE(this) + return cbegin(); + } + const_iterator cbegin() const { // NOLINT(modernize-use-nodiscard) + ROBIN_HOOD_TRACE(this) + if (empty()) { + return cend(); + } + return const_iterator(mKeyVals, mInfo, fast_forward_tag{}); + } + + iterator end() { + ROBIN_HOOD_TRACE(this) + // no need to supply valid info pointer: end() must not be dereferenced, and + // only node + // pointer is compared. + return iterator{reinterpret_cast_no_cast_align_warning(mInfo), + nullptr}; + } + const_iterator end() const { // NOLINT(modernize-use-nodiscard) + ROBIN_HOOD_TRACE(this) + return cend(); + } + const_iterator cend() const { // NOLINT(modernize-use-nodiscard) + ROBIN_HOOD_TRACE(this) + return const_iterator{reinterpret_cast_no_cast_align_warning(mInfo), + nullptr}; + } + + iterator erase(const_iterator pos) { + ROBIN_HOOD_TRACE(this) + // its safe to perform const cast here + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast) + return erase(iterator{const_cast(pos.mKeyVals), + const_cast(pos.mInfo)}); + } + + // Erases element at pos, returns iterator to the next element. + iterator erase(iterator pos) { + ROBIN_HOOD_TRACE(this) + // we assume that pos always points to a valid entry, and not end(). 
+ auto const idx = static_cast(pos.mKeyVals - mKeyVals); + + shiftDown(idx); + --mNumElements; + + if (*pos.mInfo) { + // we've backward shifted, return this again + return pos; + } + + // no backward shift, return next element + return ++pos; + } + + size_t erase(const key_type &key) { + ROBIN_HOOD_TRACE(this) + size_t idx{}; + InfoType info{}; + keyToIdx(key, &idx, &info); + + // check while info matches with the source idx + do { + if (info == mInfo[idx] && + WKeyEqual::operator()(key, mKeyVals[idx].getFirst())) { + shiftDown(idx); + --mNumElements; + return 1; + } + next(&info, &idx); + } while (info <= mInfo[idx]); + + // nothing found to delete + return 0; + } + + // reserves space for the specified number of elements. Makes sure the old + // data fits. + // exactly the same as reserve(c). + void rehash(size_t c) { + // forces a reserve + reserve(c, true); + } + + // reserves space for the specified number of elements. Makes sure the old + // data fits. + // Exactly the same as rehash(c). Use rehash(0) to shrink to fit. + void reserve(size_t c) { + // reserve, but don't force rehash + reserve(c, false); + } + + // If possible reallocates the map to a smaller one. This frees the underlying + // table. + // Does not do anything if load_factor is too large for decreasing the table's + // size. + void compact() { + ROBIN_HOOD_TRACE(this) + auto newSize = InitialNumElements; + while (calcMaxNumElementsAllowed(newSize) < mNumElements && newSize != 0) { + newSize *= 2; + } + if (ROBIN_HOOD_UNLIKELY(newSize == 0)) { + throwOverflowError(); + } + + ROBIN_HOOD_LOG("newSize > mMask + 1: " << newSize << " > " << mMask + << " + 1") + + // only actually do anything when the new size is bigger than the old one. + // This prevents to + // continuously allocate for each reserve() call. + if (newSize < mMask + 1) { + rehashPowerOfTwo(newSize, true); + } + } + + size_type size() const noexcept { // NOLINT(modernize-use-nodiscard) + ROBIN_HOOD_TRACE(this) + return mNumElements; + } + + size_type max_size() const noexcept { // NOLINT(modernize-use-nodiscard) + ROBIN_HOOD_TRACE(this) + return static_cast(-1); + } + + ROBIN_HOOD(NODISCARD) bool empty() const noexcept { + ROBIN_HOOD_TRACE(this) + return 0 == mNumElements; + } + + float max_load_factor() const noexcept { // NOLINT(modernize-use-nodiscard) + ROBIN_HOOD_TRACE(this) + return MaxLoadFactor100 / 100.0F; + } + + // Average number of elements per bucket. Since we allow only 1 per bucket + float load_factor() const noexcept { // NOLINT(modernize-use-nodiscard) + ROBIN_HOOD_TRACE(this) + return static_cast(size()) / static_cast(mMask + 1); + } + + ROBIN_HOOD(NODISCARD) size_t mask() const noexcept { + ROBIN_HOOD_TRACE(this) + return mMask; + } + + ROBIN_HOOD(NODISCARD) + size_t calcMaxNumElementsAllowed(size_t maxElements) const noexcept { + if (ROBIN_HOOD_LIKELY(maxElements <= + (std::numeric_limits::max)() / 100)) { + return maxElements * MaxLoadFactor100 / 100; + } + + // we might be a bit inprecise, but since maxElements is quite large that + // doesn't matter + return (maxElements / 100) * MaxLoadFactor100; + } + + ROBIN_HOOD(NODISCARD) + size_t calcNumBytesInfo(size_t numElements) const noexcept { + // we add a uint64_t, which houses the sentinel (first byte) and padding so + // we can load + // 64bit types. 
+ return numElements + sizeof(uint64_t); + } + + ROBIN_HOOD(NODISCARD) + size_t calcNumElementsWithBuffer(size_t numElements) const noexcept { + auto maxNumElementsAllowed = calcMaxNumElementsAllowed(numElements); + return numElements + + (std::min)(maxNumElementsAllowed, (static_cast(0xFF))); + } + + // calculation only allowed for 2^n values + ROBIN_HOOD(NODISCARD) size_t calcNumBytesTotal(size_t numElements) const { +#if ROBIN_HOOD(BITNESS) == 64 + return numElements * sizeof(Node) + calcNumBytesInfo(numElements); +#else + // make sure we're doing 64bit operations, so we are at least safe against + // 32bit overflows. + auto const ne = static_cast(numElements); + auto const s = static_cast(sizeof(Node)); + auto const infos = static_cast(calcNumBytesInfo(numElements)); + + auto const total64 = ne * s + infos; + auto const total = static_cast(total64); + + if (ROBIN_HOOD_UNLIKELY(static_cast(total) != total64)) { + throwOverflowError(); + } + return total; +#endif + } + + private: + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::value, bool>::type + has(const value_type &e) const { + ROBIN_HOOD_TRACE(this) + auto it = find(e.first); + return it != end() && it->second == e.second; + } + + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::value, bool>::type + has(const value_type &e) const { + ROBIN_HOOD_TRACE(this) + return find(e) != end(); + } + + void reserve(size_t c, bool forceRehash) { + ROBIN_HOOD_TRACE(this) + auto const minElementsAllowed = (std::max)(c, mNumElements); + auto newSize = InitialNumElements; + while (calcMaxNumElementsAllowed(newSize) < minElementsAllowed && + newSize != 0) { + newSize *= 2; + } + if (ROBIN_HOOD_UNLIKELY(newSize == 0)) { + throwOverflowError(); + } + + ROBIN_HOOD_LOG("newSize > mMask + 1: " << newSize << " > " << mMask + << " + 1") + + // only actually do anything when the new size is bigger than the old one. + // This prevents to + // continuously allocate for each reserve() call. + if (forceRehash || newSize > mMask + 1) { + rehashPowerOfTwo(newSize, false); + } + } + + // reserves space for at least the specified number of elements. + // only works if numBuckets if power of two + // True on success, false otherwise + void rehashPowerOfTwo(size_t numBuckets, bool forceFree) { + ROBIN_HOOD_TRACE(this) + + Node *const oldKeyVals = mKeyVals; + uint8_t const *const oldInfo = mInfo; + + const size_t oldMaxElementsWithBuffer = + calcNumElementsWithBuffer(mMask + 1); + + // resize operation: move stuff + initData(numBuckets); + if (oldMaxElementsWithBuffer > 1) { + for (size_t i = 0; i < oldMaxElementsWithBuffer; ++i) { + if (oldInfo[i] != 0) { + // might throw an exception, which is really bad since we are in the + // middle of + // moving stuff. + insert_move(std::move(oldKeyVals[i])); + // destroy the node but DON'T destroy the data. + oldKeyVals[i].~Node(); + } + } + + // this check is not necessary as it's guarded by the previous if, but it + // helps + // silence g++'s overeager "attempt to free a non-heap object 'map' + // [-Werror=free-nonheap-object]" warning. 
+ if (oldKeyVals != + reinterpret_cast_no_cast_align_warning(&mMask)) { + // don't destroy old data: put it into the pool instead + if (forceFree) { + std::free(oldKeyVals); + } else { + DataPool::addOrFree(oldKeyVals, + calcNumBytesTotal(oldMaxElementsWithBuffer)); + } + } + } + } + + ROBIN_HOOD(NOINLINE) void throwOverflowError() const { +#if ROBIN_HOOD(HAS_EXCEPTIONS) + throw std::overflow_error("robin_hood::map overflow"); +#else + abort(); +#endif + } + + template + std::pair try_emplace_impl(OtherKey &&key, Args &&... args) { + ROBIN_HOOD_TRACE(this) + auto idxAndState = insertKeyPrepareEmptySpot(key); + switch (idxAndState.second) { + case InsertionState::key_found: + break; + + case InsertionState::new_node: + ::new (static_cast(&mKeyVals[idxAndState.first])) + Node(*this, std::piecewise_construct, + std::forward_as_tuple(std::forward(key)), + std::forward_as_tuple(std::forward(args)...)); + break; + + case InsertionState::overwrite_node: + mKeyVals[idxAndState.first] = + Node(*this, std::piecewise_construct, + std::forward_as_tuple(std::forward(key)), + std::forward_as_tuple(std::forward(args)...)); + break; + + case InsertionState::overflow_error: + throwOverflowError(); + break; + } + + return std::make_pair( + iterator(mKeyVals + idxAndState.first, mInfo + idxAndState.first), + InsertionState::key_found != idxAndState.second); + } + + template + std::pair insertOrAssignImpl(OtherKey &&key, Mapped &&obj) { + ROBIN_HOOD_TRACE(this) + auto idxAndState = insertKeyPrepareEmptySpot(key); + switch (idxAndState.second) { + case InsertionState::key_found: + mKeyVals[idxAndState.first].getSecond() = std::forward(obj); + break; + + case InsertionState::new_node: + ::new (static_cast(&mKeyVals[idxAndState.first])) + Node(*this, std::piecewise_construct, + std::forward_as_tuple(std::forward(key)), + std::forward_as_tuple(std::forward(obj))); + break; + + case InsertionState::overwrite_node: + mKeyVals[idxAndState.first] = + Node(*this, std::piecewise_construct, + std::forward_as_tuple(std::forward(key)), + std::forward_as_tuple(std::forward(obj))); + break; + + case InsertionState::overflow_error: + throwOverflowError(); + break; + } + + return std::make_pair( + iterator(mKeyVals + idxAndState.first, mInfo + idxAndState.first), + InsertionState::key_found != idxAndState.second); + } + + void initData(size_t max_elements) { + mNumElements = 0; + mMask = max_elements - 1; + mMaxNumElementsAllowed = calcMaxNumElementsAllowed(max_elements); + + auto const numElementsWithBuffer = calcNumElementsWithBuffer(max_elements); + + // calloc also zeroes everything + auto const numBytesTotal = calcNumBytesTotal(numElementsWithBuffer); + ROBIN_HOOD_LOG("std::calloc " << numBytesTotal << " = calcNumBytesTotal(" + << numElementsWithBuffer << ")") + mKeyVals = reinterpret_cast( + detail::assertNotNull(std::calloc(1, numBytesTotal))); + mInfo = reinterpret_cast(mKeyVals + numElementsWithBuffer); + + // set sentinel + mInfo[numElementsWithBuffer] = 1; + + mInfoInc = InitialInfoInc; + mInfoHashShift = InitialInfoHashShift; + } + + enum class InsertionState { + overflow_error, + key_found, + new_node, + overwrite_node + }; + + // Finds key, and if not already present prepares a spot where to pot the key + // & value. + // This potentially shifts nodes out of the way, updates mInfo and number of + // inserted + // elements, so the only operation left to do is create/assign a new node at + // that spot. 
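The try_emplace and insert_or_assign implementations above mirror the std::unordered_map API; a short usage sketch (include path assumed as before):

#include <string>
#include "robin_hood.h"  // assumed include path

inline void upsert_example(robin_hood::unordered_map<std::string, int>* counts) {
  counts->try_emplace("paddle", 0);       // constructs the value only if the key is absent
  counts->insert_or_assign("paddle", 1);  // overwrites an existing mapped value
  ++(*counts)["gpu"];                     // operator[] default-constructs missing entries
}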
+ template + std::pair insertKeyPrepareEmptySpot(OtherKey &&key) { + for (int i = 0; i < 256; ++i) { + size_t idx{}; + InfoType info{}; + keyToIdx(key, &idx, &info); + nextWhileLess(&info, &idx); + + // while we potentially have a match + while (info == mInfo[idx]) { + if (WKeyEqual::operator()(key, mKeyVals[idx].getFirst())) { + // key already exists, do NOT insert. + // see http://en.cppreference.com/w/cpp/container/unordered_map/insert + return std::make_pair(idx, InsertionState::key_found); + } + next(&info, &idx); + } + + // unlikely that this evaluates to true + if (ROBIN_HOOD_UNLIKELY(mNumElements >= mMaxNumElementsAllowed)) { + if (!increase_size()) { + return std::make_pair(size_t(0), InsertionState::overflow_error); + } + continue; + } + + // key not found, so we are now exactly where we want to insert it. + auto const insertion_idx = idx; + auto const insertion_info = info; + if (ROBIN_HOOD_UNLIKELY(insertion_info + mInfoInc > 0xFF)) { + mMaxNumElementsAllowed = 0; + } + + // find an empty spot + while (0 != mInfo[idx]) { + next(&info, &idx); + } + + if (idx != insertion_idx) { + shiftUp(idx, insertion_idx); + } + // put at empty spot + mInfo[insertion_idx] = static_cast(insertion_info); + ++mNumElements; + return std::make_pair( + insertion_idx, idx == insertion_idx ? InsertionState::new_node + : InsertionState::overwrite_node); + } + + // enough attempts failed, so finally give up. + return std::make_pair(size_t(0), InsertionState::overflow_error); + } + + bool try_increase_info() { + ROBIN_HOOD_LOG("mInfoInc=" << mInfoInc << ", numElements=" << mNumElements + << ", maxNumElementsAllowed=" + << calcMaxNumElementsAllowed(mMask + 1)) + if (mInfoInc <= 2) { + // need to be > 2 so that shift works (otherwise undefined behavior!) + return false; + } + // we got space left, try to make info smaller + mInfoInc = static_cast(mInfoInc >> 1U); + + // remove one bit of the hash, leaving more space for the distance info. + // This is extremely fast because we can operate on 8 bytes at once. + ++mInfoHashShift; + auto const numElementsWithBuffer = calcNumElementsWithBuffer(mMask + 1); + + for (size_t i = 0; i < numElementsWithBuffer; i += 8) { + auto val = unaligned_load(mInfo + i); + val = (val >> 1U) & UINT64_C(0x7f7f7f7f7f7f7f7f); + std::memcpy(mInfo + i, &val, sizeof(val)); + } + // update sentinel, which might have been cleared out! + mInfo[numElementsWithBuffer] = 1; + + mMaxNumElementsAllowed = calcMaxNumElementsAllowed(mMask + 1); + return true; + } + + // True if resize was possible, false otherwise + bool increase_size() { + // nothing allocated yet? just allocate InitialNumElements + if (0 == mMask) { + initData(InitialNumElements); + return true; + } + + auto const maxNumElementsAllowed = calcMaxNumElementsAllowed(mMask + 1); + if (mNumElements < maxNumElementsAllowed && try_increase_info()) { + return true; + } + + ROBIN_HOOD_LOG("mNumElements=" + << mNumElements + << ", maxNumElementsAllowed=" << maxNumElementsAllowed + << ", load=" << (static_cast(mNumElements) * 100.0 / + (static_cast(mMask) + 1))) + + nextHashMultiplier(); + if (mNumElements * 2 < calcMaxNumElementsAllowed(mMask + 1)) { + // we have to resize, even though there would still be plenty of space + // left! + // Try to rehash instead. Delete freed memory so we don't steadyily + // increase mem in case + // we have to rehash a few times + rehashPowerOfTwo(mMask + 1, true); + } else { + // Each resize use a different hash so we don't so easily overflow. 
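try_increase_info above halves every info byte in bulk, eight at a time, by loading them as one uint64_t; the masked shift below shows the same trick in isolation (illustrative name):

#include <cstdint>

// Shift each of the eight packed info bytes right by one; the mask stops a
// bit of one byte from leaking into its lower neighbour.
inline std::uint64_t halve_each_byte(std::uint64_t eight_info_bytes) {
  return (eight_info_bytes >> 1U) & UINT64_C(0x7f7f7f7f7f7f7f7f);
}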
+ // Make sure we only have odd numbers, so that the multiplication is + // reversible! + rehashPowerOfTwo((mMask + 1) * 2, false); + } + return true; + } + + void nextHashMultiplier() { + // adding an *even* number, so that the multiplier will always stay odd. + // This is necessary + // so that the hash stays a mixing function (and thus doesn't have any + // information loss). + mHashMultiplier += UINT64_C(0xc4ceb9fe1a85ec54); + } + + void destroy() { + if (0 == mMask) { + // don't deallocate! + return; + } + + Destroyer::value>{} + .nodesDoNotDeallocate(*this); + + // This protection against not deleting mMask shouldn't be needed as it's + // sufficiently + // protected with the 0==mMask check, but I have this anyways because g++ 7 + // otherwise + // reports a compile error: attempt to free a non-heap object 'fm' + // [-Werror=free-nonheap-object] + if (mKeyVals != reinterpret_cast_no_cast_align_warning(&mMask)) { + ROBIN_HOOD_LOG("std::free") + std::free(mKeyVals); + } + } + + void init() noexcept { + mKeyVals = reinterpret_cast_no_cast_align_warning(&mMask); + mInfo = reinterpret_cast(&mMask); + mNumElements = 0; + mMask = 0; + mMaxNumElementsAllowed = 0; + mInfoInc = InitialInfoInc; + mInfoHashShift = InitialInfoHashShift; + } + + // members are sorted so no padding occurs + uint64_t mHashMultiplier = UINT64_C(0xc4ceb9fe1a85ec53); // 8 byte 8 + Node *mKeyVals = + reinterpret_cast_no_cast_align_warning(&mMask); // 8 byte 16 + uint8_t *mInfo = reinterpret_cast(&mMask); // 8 byte 24 + size_t mNumElements = 0; // 8 byte 32 + size_t mMask = 0; // 8 byte 40 + size_t mMaxNumElementsAllowed = 0; // 8 byte 48 + InfoType mInfoInc = InitialInfoInc; // 4 byte 52 + InfoType mInfoHashShift = InitialInfoHashShift; // 4 byte 56 + // 16 byte 56 if NodeAllocator +}; + +} // namespace detail + +// map + +template , + typename KeyEqual = std::equal_to, size_t MaxLoadFactor100 = 80> +using unordered_flat_map = + detail::Table; + +template , + typename KeyEqual = std::equal_to, size_t MaxLoadFactor100 = 80> +using unordered_node_map = + detail::Table; + +template , + typename KeyEqual = std::equal_to, size_t MaxLoadFactor100 = 80> +using unordered_map = detail::Table< + sizeof(robin_hood::pair) <= sizeof(size_t) * 6 && + std::is_nothrow_move_constructible>::value && + std::is_nothrow_move_assignable>::value, + MaxLoadFactor100, Key, T, Hash, KeyEqual>; + +// set + +template , + typename KeyEqual = std::equal_to, size_t MaxLoadFactor100 = 80> +using unordered_flat_set = + detail::Table; + +template , + typename KeyEqual = std::equal_to, size_t MaxLoadFactor100 = 80> +using unordered_node_set = + detail::Table; + +template , + typename KeyEqual = std::equal_to, size_t MaxLoadFactor100 = 80> +using unordered_set = + detail::Table::value && + std::is_nothrow_move_assignable::value, + MaxLoadFactor100, Key, void, Hash, KeyEqual>; + +} // namespace robin_hood + +#endif diff --git a/paddle/fluid/extension/include/ext_dispatch.h b/paddle/fluid/extension/include/ext_dispatch.h index eed736046496f3af799c82f4f9236a3fc88450fc..9b3e199708adc93356c214df3be217f67d2e8949 100644 --- a/paddle/fluid/extension/include/ext_dispatch.h +++ b/paddle/fluid/extension/include/ext_dispatch.h @@ -47,6 +47,22 @@ namespace paddle { } \ }() +#define PD_DISPATCH_FLOATING_AND_HALF_TYPES(TYPE, NAME, ...) 
\ + [&] { \ + const auto& __dtype__ = TYPE; \ + switch (__dtype__) { \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::FLOAT32, float, \ + __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::FLOAT64, double, \ + __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::FLOAT16, paddle::float16, \ + __VA_ARGS__) \ + default: \ + PD_THROW("function " #NAME " is not implemented for data type `", \ + ::paddle::ToString(__dtype__), "`"); \ + } \ + }() + ///////// Integral Dispatch Marco /////////// #define PD_DISPATCH_INTEGRAL_TYPES(TYPE, NAME, ...) \ @@ -68,6 +84,22 @@ namespace paddle { } \ }() +///////// Complex Dispatch Marco /////////// + +#define PD_DISPATCH_COMPLEX_TYPES(TYPE, NAME, ...) \ + [&] { \ + const auto& __dtype__ = TYPE; \ + switch (__dtype__) { \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::COMPLEX64, \ + ::paddle::complex64, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::COMPLEX128, \ + ::paddle::complex128, __VA_ARGS__) \ + default: \ + PD_THROW("function " #NAME " is not implemented for data type `" + \ + ::paddle::ToString(__dtype__) + "`"); \ + } \ + }() + ///////// Floating and Integral Dispatch Marco /////////// #define PD_DISPATCH_FLOATING_AND_INTEGRAL_TYPES(TYPE, NAME, ...) \ @@ -93,6 +125,55 @@ namespace paddle { } \ }() +///////// Floating and Complex Dispatch Marco /////////// + +#define PD_DISPATCH_FLOATING_AND_COMPLEX_TYPES(TYPE, NAME, ...) \ + [&] { \ + const auto& __dtype__ = TYPE; \ + switch (__dtype__) { \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::FLOAT32, float, \ + __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::FLOAT64, double, \ + __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::COMPLEX64, \ + ::paddle::complex64, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::COMPLEX128, \ + ::paddle::complex128, __VA_ARGS__) \ + default: \ + PD_THROW("function " #NAME " is not implemented for data type `" + \ + ::paddle::ToString(__dtype__) + "`"); \ + } \ + }() + +///////// Floating, Integral and Complex Dispatch Marco /////////// + +#define PD_DISPATCH_FLOATING_AND_INTEGRAL_AND_COMPLEX_TYPES(TYPE, NAME, ...) 
\ + [&] { \ + const auto& __dtype__ = TYPE; \ + switch (__dtype__) { \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::FLOAT32, float, \ + __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::FLOAT64, double, \ + __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::INT32, int, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::INT64, int64_t, \ + __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::INT8, int8_t, \ + __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::UINT8, uint8_t, \ + __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::INT16, int16_t, \ + __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::COMPLEX64, \ + ::paddle::complex64, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::COMPLEX128, \ + ::paddle::complex128, __VA_ARGS__) \ + default: \ + PD_THROW("function " #NAME " is not implemented for data type `" + \ + ::paddle::ToString(__dtype__) + "`"); \ + } \ + }() + // TODO(chenweihang): Add more Marcos in the future if needed } // namespace paddle diff --git a/paddle/fluid/extension/include/ext_dtype.h b/paddle/fluid/extension/include/ext_dtype.h index 46c4bac23606449dd2ba68b25818fd58d88f6204..3890631a6f8a9e99948e32cdd3cb8c1e00c2de75 100644 --- a/paddle/fluid/extension/include/ext_dtype.h +++ b/paddle/fluid/extension/include/ext_dtype.h @@ -16,10 +16,17 @@ limitations under the License. */ #include #include +#include "complex128.h" // NOLINT +#include "complex64.h" // NOLINT #include "ext_exception.h" // NOLINT +#include "float16.h" // NOLINT namespace paddle { +using complex64 = paddle::platform::complex64; +using complex128 = paddle::platform::complex128; +using float16 = paddle::platform::float16; + enum class DataType { BOOL, INT8, @@ -27,8 +34,11 @@ enum class DataType { INT16, INT32, INT64, + FLOAT16, FLOAT32, FLOAT64, + COMPLEX64, + COMPLEX128, // TODO(JiabinYang) support more data types if needed. 
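+  // The new FLOAT16 / COMPLEX64 / COMPLEX128 entries correspond to the
+  // paddle::float16, paddle::complex64 and paddle::complex128 aliases declared
+  // above, and are the cases the new PD_DISPATCH_FLOATING_AND_HALF_TYPES and
+  // PD_DISPATCH_*_COMPLEX_TYPES macros in ext_dispatch.h switch over. A rough
+  // usage sketch (hypothetical kernel and tensor names; `data_t` is assumed to
+  // be the per-case type alias introduced by PD_PRIVATE_CASE_TYPE):
+  //
+  //   PD_DISPATCH_FLOATING_AND_COMPLEX_TYPES(
+  //       x.type(), "my_op_forward", ([&] {
+  //         my_op_forward_kernel<data_t>(
+  //             x.data<data_t>(), out.mutable_data<data_t>(x.place()), x.size());
+  //       }));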
}; @@ -46,24 +56,33 @@ inline std::string ToString(DataType dtype) { return "int32_t"; case DataType::INT64: return "int64_t"; + case DataType::FLOAT16: + return "float16"; case DataType::FLOAT32: return "float"; case DataType::FLOAT64: return "double"; + case DataType::COMPLEX64: + return "complex64"; + case DataType::COMPLEX128: + return "complex128"; default: PD_THROW("Unsupported paddle enum data type."); } } -#define PD_FOR_EACH_DATA_TYPE(_) \ - _(bool, DataType::BOOL) \ - _(int8_t, DataType::INT8) \ - _(uint8_t, DataType::UINT8) \ - _(int16_t, DataType::INT16) \ - _(int, DataType::INT32) \ - _(int64_t, DataType::INT64) \ - _(float, DataType::FLOAT32) \ - _(double, DataType::FLOAT64) +#define PD_FOR_EACH_DATA_TYPE(_) \ + _(bool, DataType::BOOL) \ + _(int8_t, DataType::INT8) \ + _(uint8_t, DataType::UINT8) \ + _(int16_t, DataType::INT16) \ + _(int, DataType::INT32) \ + _(int64_t, DataType::INT64) \ + _(float16, DataType::FLOAT16) \ + _(float, DataType::FLOAT32) \ + _(double, DataType::FLOAT64) \ + _(complex64, DataType::COMPLEX64) \ + _(complex128, DataType::COMPLEX128) template struct DataTypeToCPPType; diff --git a/paddle/fluid/extension/include/ext_op_meta_info.h b/paddle/fluid/extension/include/ext_op_meta_info.h index a3b9a4c491033db80f4c10829c0ee61b50387b25..c400164c7543da9878d0fb51a6f239dfaff5beff 100644 --- a/paddle/fluid/extension/include/ext_op_meta_info.h +++ b/paddle/fluid/extension/include/ext_op_meta_info.h @@ -56,38 +56,55 @@ using Tensor = paddle::Tensor; ///////////////// Util Define and Function //////////////// -inline std::string Grad(const std::string& var_name) { +constexpr char kGradTensorSuffix[] = "@GRAD"; +constexpr char kTensorVectorSuffix[] = "@VECTOR"; + +// Used for Construct Grad Tensor name +inline std::string Grad(const std::string& t_name) { + std::string result; + result.reserve(t_name.size() + 5U); + result += t_name; + result += kGradTensorSuffix; + return result; +} + +// Used for Construct std::vector name +inline std::string Vec(const std::string& t_name) { std::string result; - result.reserve(var_name.size() + 5U); - result += var_name; - result += "@GRAD"; + result.reserve(t_name.size() + 7U); + result += t_name; + result += kTensorVectorSuffix; return result; } ////////////////////// Kernel Function (PD_KERNEL) //////////////////////// // Record Op kernel core function -using KernelFunc = std::vector (*)(std::vector inputs, - std::vector attrs); - -#define PD_SPECIALIZE_ComputeCallHelper(attr_type) \ - template \ - struct ComputeCallHelper { \ - template \ - static Return Compute(std::vector inputs, \ - std::vector attrs, \ - const PreviousArgs&... pargs) { \ - try { \ - attr_type arg = boost::any_cast(attrs[attr_idx]); \ - return ComputeCallHelper::template Compute( \ - inputs, attrs, pargs..., arg); \ - } catch (boost::bad_any_cast&) { \ - PD_THROW( \ - "Attribute cast error in custom operator. Expected " #attr_type \ - " value."); \ - } \ - } \ +using KernelFunc = + std::vector (*)(const std::vector& inputs, + const std::vector>& vec_inputs, + const std::vector& attrs); + +#define PD_SPECIALIZE_ComputeCallHelper(attr_type) \ + template \ + struct ComputeCallHelper { \ + template \ + static Return Compute(const std::vector& inputs, \ + const std::vector>& vec_inputs, \ + const std::vector& attrs, \ + const PreviousArgs&... 
pargs) { \ + try { \ + attr_type arg = boost::any_cast(attrs[attr_idx]); \ + return ComputeCallHelper::template Compute< \ + in_idx, vec_in_idx, attr_idx + 1>(inputs, vec_inputs, attrs, \ + pargs..., arg); \ + } catch (boost::bad_any_cast&) { \ + PD_THROW( \ + "Attribute cast error in custom operator. Expected " #attr_type \ + " value."); \ + } \ + } \ } template @@ -98,31 +115,64 @@ struct KernelFuncImpl; template struct KernelFuncImpl { - static Return Compute(std::vector inputs, - std::vector attrs) { - return ComputeCallHelper>::template Compute<0, 0>( - inputs, attrs); + static Return Compute(const std::vector& inputs, + const std::vector>& vec_inputs, + const std::vector& attrs) { + return ComputeCallHelper>::template Compute<0, 0, 0>( + inputs, vec_inputs, attrs); } private: template struct ComputeCallHelper; - // for Tensor input template struct ComputeCallHelper { - template - static Return Compute(std::vector inputs, - std::vector attrs, + template + static Return Compute(const std::vector& inputs, + const std::vector>& vec_inputs, + const std::vector& attrs, const PreviousArgs&... pargs) { - static_assert(attr_idx == 0, - "Input tensor should appear before attributes."); const Tensor& arg = inputs[in_idx]; - return ComputeCallHelper::template Compute( - inputs, attrs, pargs..., arg); + return ComputeCallHelper::template Compute( + inputs, vec_inputs, attrs, pargs..., arg); } }; + template + struct ComputeCallHelper&, Tail...> { + template + static Return Compute(const std::vector& inputs, + const std::vector>& vec_inputs, + const std::vector& attrs, + const PreviousArgs&... pargs) { + const std::vector& arg = vec_inputs[vec_in_idx]; + return ComputeCallHelper::template Compute< + in_idx, vec_in_idx + 1, attr_idx>(inputs, vec_inputs, attrs, pargs..., + arg); + } + }; + + PD_SPECIALIZE_ComputeCallHelper(const bool&); + PD_SPECIALIZE_ComputeCallHelper(const int&); + PD_SPECIALIZE_ComputeCallHelper(const float&); + PD_SPECIALIZE_ComputeCallHelper(const int64_t&); + PD_SPECIALIZE_ComputeCallHelper(const std::string&); + PD_SPECIALIZE_ComputeCallHelper(const std::vector&); + PD_SPECIALIZE_ComputeCallHelper(const std::vector&); + PD_SPECIALIZE_ComputeCallHelper(const std::vector&); + PD_SPECIALIZE_ComputeCallHelper(const std::vector&); + // TODO(chenweihang): support other attribute type if needed. + // Why not support other attribute type here? + // - boost::blank, std::vector and std::vector + // are not used in op + // - BlockDesc* and std::vector are used in framework + + // NOTE(chenweihang): Used to be compatible with the 2.0.1 released + // interface, and will be deprecated in the future PD_SPECIALIZE_ComputeCallHelper(bool); PD_SPECIALIZE_ComputeCallHelper(int); PD_SPECIALIZE_ComputeCallHelper(float); @@ -132,17 +182,15 @@ struct KernelFuncImpl { PD_SPECIALIZE_ComputeCallHelper(std::vector); PD_SPECIALIZE_ComputeCallHelper(std::vector); PD_SPECIALIZE_ComputeCallHelper(std::vector); - // TODO(chenweihang): support other attribute type if needed. - // Why not support other attribute type here? - // - boost::blank, std::vector and std::vector - // are not used in op - // - BlockDesc* and std::vector are used in framework + // end: base template template struct ComputeCallHelper> { - template - static Return Compute(std::vector inputs, - std::vector attrs, const Args&... args) { + template + static Return Compute(const std::vector& inputs, + const std::vector>& vec_inputs, + const std::vector& attrs, + const Args&... 
args) { return impl_fn(args...); } }; @@ -155,40 +203,118 @@ struct KernelFuncImpl { // Record Op infershape core function using InferShapeFunc = std::vector> (*)( - std::vector> input_shapes); + const std::vector>& input_shapes, + const std::vector>>& vec_input_shapes, + const std::vector& attrs); + +#define PD_SPECIALIZE_InferShapeCallHelper_FOR_SHAPE(input_type) \ + template \ + struct InferShapeCallHelper { \ + template \ + static Return InferShape( \ + const std::vector>& input_shapes, \ + const std::vector>>& \ + vec_input_shapes, \ + const std::vector& attrs, const PreviousArgs&... pargs) { \ + input_type arg = input_shapes[in_idx]; \ + return InferShapeCallHelper::template InferShape< \ + in_idx + 1, vec_in_idx, attr_idx>(input_shapes, vec_input_shapes, \ + attrs, pargs..., arg); \ + } \ + } + +#define PD_SPECIALIZE_InferShapeCallHelper_FOR_SHAPES(input_type) \ + template \ + struct InferShapeCallHelper { \ + template \ + static Return InferShape( \ + const std::vector>& input_shapes, \ + const std::vector>>& \ + vec_input_shapes, \ + const std::vector& attrs, const PreviousArgs&... pargs) { \ + input_type arg = vec_input_shapes[vec_in_idx]; \ + return InferShapeCallHelper::template InferShape< \ + in_idx, vec_in_idx + 1, attr_idx>(input_shapes, vec_input_shapes, \ + attrs, pargs..., arg); \ + } \ + } + +#define PD_SPECIALIZE_InferShapeCallHelper_FOR_ATTR(attr_type) \ + template \ + struct InferShapeCallHelper { \ + template \ + static Return InferShape( \ + const std::vector>& input_shapes, \ + const std::vector>>& \ + vec_input_shapes, \ + const std::vector& attrs, const PreviousArgs&... pargs) { \ + try { \ + attr_type arg = boost::any_cast(attrs[attr_idx]); \ + return InferShapeCallHelper::template InferShape< \ + in_idx, vec_in_idx, attr_idx + 1>(input_shapes, vec_input_shapes, \ + attrs, pargs..., arg); \ + } catch (boost::bad_any_cast&) { \ + PD_THROW( \ + "Attribute cast error in custom operator InferShapeFn. " \ + "Expected " #attr_type \ + " value. InferShapeFn's attribute list must be exactly same as " \ + "Forward " \ + "KernelFn's attribute list except std::vector " \ + "attribute."); \ + } \ + } \ + } template struct InferShapeFuncImpl; template struct InferShapeFuncImpl { - static Return InferShape(std::vector> input_shapes) { - return InferShapeCallHelper>::template InferShape<0>( - input_shapes); + static Return InferShape( + const std::vector>& input_shapes, + const std::vector>>& vec_input_shapes, + const std::vector& attrs) { + return InferShapeCallHelper>::template InferShape< + 0, 0, 0>(input_shapes, vec_input_shapes, attrs); } private: template struct InferShapeCallHelper; - // only one type input: std::vector - template - struct InferShapeCallHelper, Tail...> { - template - static Return InferShape(std::vector> input_shapes, - const PreviousArgs&... 
pargs) { - std::vector arg = input_shapes[in_idx]; - return InferShapeCallHelper::template InferShape( - input_shapes, pargs..., arg); - } - }; + PD_SPECIALIZE_InferShapeCallHelper_FOR_SHAPE(const std::vector&); + PD_SPECIALIZE_InferShapeCallHelper_FOR_SHAPES( + const std::vector>&); + + // NOTE(chenweihang): Used to be compatible with the 2.0.1 released + // interface, and will be deprecated in the future + PD_SPECIALIZE_InferShapeCallHelper_FOR_SHAPE(std::vector); + PD_SPECIALIZE_InferShapeCallHelper_FOR_SHAPES( + std::vector>); + + PD_SPECIALIZE_InferShapeCallHelper_FOR_ATTR(const bool&); + PD_SPECIALIZE_InferShapeCallHelper_FOR_ATTR(const int&); + PD_SPECIALIZE_InferShapeCallHelper_FOR_ATTR(const float&); + PD_SPECIALIZE_InferShapeCallHelper_FOR_ATTR(const int64_t&); + PD_SPECIALIZE_InferShapeCallHelper_FOR_ATTR(const std::string&); + PD_SPECIALIZE_InferShapeCallHelper_FOR_ATTR(const std::vector&); + PD_SPECIALIZE_InferShapeCallHelper_FOR_ATTR(const std::vector&); + PD_SPECIALIZE_InferShapeCallHelper_FOR_ATTR(const std::vector&); + // NOTE(chenweihang): InferShape can't support std::vector attr type, + // because the input type is std::vector, only can use one rule to + // parse std::vector parameter // end: base template template struct InferShapeCallHelper> { - template - static Return InferShape(std::vector> input_shapes, - const Args&... args) { + template + static Return InferShape( + const std::vector>& input_shapes, + const std::vector>>& vec_input_shapes, + const std::vector& attrs, const Args&... args) { return impl_fn(args...); } }; @@ -200,41 +326,73 @@ struct InferShapeFuncImpl { /////////////// InferDataType Function (PD_INFER_DTYPE) /////////////// // Record Op Infer dtype core function -using InferDtypeFunc = - std::vector (*)(std::vector input_dtypes); +using InferDtypeFunc = std::vector (*)( + const std::vector& input_dtypes, + const std::vector>& vec_input_dtypes); + +#define PD_SPECIALIZE_InferDtypeCallHelper_TO_DTYPE(input_type) \ + template \ + struct InferDtypeCallHelper { \ + template \ + static Return InferDtype( \ + const std::vector& input_dtypes, \ + const std::vector>& vec_input_dtypes, \ + const PreviousArgs&... pargs) { \ + input_type arg = input_dtypes[in_idx]; \ + return InferDtypeCallHelper::template InferDtype( \ + input_dtypes, vec_input_dtypes, pargs..., arg); \ + } \ + } + +#define PD_SPECIALIZE_InferDtypeCallHelper_FOR_DTYPES(input_type) \ + template \ + struct InferDtypeCallHelper { \ + template \ + static Return InferDtype( \ + const std::vector& input_dtypes, \ + const std::vector>& vec_input_dtypes, \ + const PreviousArgs&... pargs) { \ + input_type arg = vec_input_dtypes[vec_in_idx]; \ + return InferDtypeCallHelper::template InferDtype< \ + in_idx, vec_in_idx + 1>(input_dtypes, vec_input_dtypes, pargs..., \ + arg); \ + } \ + } template struct InferDtypeFuncImpl; template struct InferDtypeFuncImpl { - static Return InferDtype(std::vector input_dtypes) { - return InferDtypeCallHelper>::template InferDtype<0>( - input_dtypes); + static Return InferDtype( + const std::vector& input_dtypes, + const std::vector>& vec_input_dtypes) { + return InferDtypeCallHelper>::template InferDtype<0, + 0>( + input_dtypes, vec_input_dtypes); } private: template struct InferDtypeCallHelper; - // Only one type input now: DataType - template - struct InferDtypeCallHelper { - template - static Return InferDtype(std::vector input_dtypes, - const PreviousArgs&... 
pargs) { - DataType arg = input_dtypes[in_idx]; - return InferDtypeCallHelper::template InferDtype( - input_dtypes, pargs..., arg); - } - }; + PD_SPECIALIZE_InferDtypeCallHelper_TO_DTYPE(const DataType&); + PD_SPECIALIZE_InferDtypeCallHelper_FOR_DTYPES(const std::vector&); + + // NOTE(chenweihang): Used to be compatible with the 2.0.1 released + // interface, and will be deprecated in the future + PD_SPECIALIZE_InferDtypeCallHelper_TO_DTYPE(DataType); + PD_SPECIALIZE_InferDtypeCallHelper_FOR_DTYPES(std::vector); // end: base template template struct InferDtypeCallHelper> { - template - static Return InferDtype(std::vector input_dtypes, - const Args&... args) { + template + static Return InferDtype( + const std::vector& input_dtypes, + const std::vector>& vec_input_dtypes, + const Args&... args) { return impl_fn(args...); } }; diff --git a/paddle/fluid/extension/include/ext_tensor.h b/paddle/fluid/extension/include/ext_tensor.h index be492a6d5535d17df579ec4fec8dd76d266a3029..fa91490e6cd8af7c3a4ff4b2b3013b519c59aa2a 100644 --- a/paddle/fluid/extension/include/ext_tensor.h +++ b/paddle/fluid/extension/include/ext_tensor.h @@ -52,6 +52,9 @@ class PD_DLL_DECL Tensor { /// \brief Construct a Tensor on target Place for CustomOp. /// Generally it's only used for user to create Tensor. explicit Tensor(const PlaceType& place); + /// \brief Construct a Tensor on target Place with shape for CustomOp. + /// Generally it's only used for user to create Tensor. + Tensor(const PlaceType& place, const std::vector& shape); /// \brief Reset the shape of the tensor. /// Generally it's only used for the input tensor. /// Reshape must be called before calling @@ -110,6 +113,9 @@ class PD_DLL_DECL Tensor { /// \brief Cast datatype from one to another Tensor cast(const DataType& target_type) const; + /// \brief Check Tensor is initialized + bool is_initialized() const; + #ifdef PADDLE_WITH_CUDA /// \bref Get current stream of Tensor cudaStream_t stream() const; diff --git a/paddle/fluid/extension/src/ext_tensor.cc b/paddle/fluid/extension/src/ext_tensor.cc index 4434a3bf5941ffdd668c31c737493162e67bea5e..8b2f7cc5bf13c99b80cd365f5c449f3d3b68bdc5 100644 --- a/paddle/fluid/extension/src/ext_tensor.cc +++ b/paddle/fluid/extension/src/ext_tensor.cc @@ -13,11 +13,16 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/extension/include/ext_tensor.h" + #include + #include "paddle/fluid/framework/custom_tensor_utils.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/platform/complex128.h" +#include "paddle/fluid/platform/complex64.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/transform.h" namespace paddle { @@ -97,13 +102,23 @@ void GpuCopy(T *src, T *dst, PlaceType src_plc, PlaceType dst_plc, void Tensor::reshape(const std::vector &shape) { GET_CASTED_TENSOR - tensor->Resize(framework::make_ddim(shape)); + auto new_dim = framework::make_ddim(shape); + tensor->Resize(new_dim); } Tensor::Tensor(const PlaceType &place) : tensor_(std::make_shared()), place_(place), stream_(StreamWrapper()) {} + +Tensor::Tensor(const PlaceType &place, const std::vector &shape) + : tensor_(std::make_shared()), + place_(place), + stream_(StreamWrapper()) { + GET_CASTED_TENSOR + tensor->Resize(framework::make_ddim(shape)); +} + template T *Tensor::mutable_data(const PlaceType &place) { place_ = place; @@ -162,6 +177,12 @@ DataType Tensor::type() const { return DataType::FLOAT64; } else if (type == framework::proto::VarType::BOOL) { return DataType::BOOL; + } else if (type == framework::proto::VarType::COMPLEX64) { + return DataType::COMPLEX64; + } else if (type == framework::proto::VarType::COMPLEX128) { + return DataType::COMPLEX128; + } else if (type == framework::proto::VarType::FP16) { + return DataType::FLOAT16; } // TODO(JiabinYang) Support more dtype here return DataType::FLOAT32; @@ -217,6 +238,12 @@ template PD_DLL_DECL Tensor Tensor::copy_to(const PlaceType &target_place) const; template PD_DLL_DECL Tensor Tensor::copy_to(const PlaceType &target_place) const; +template PD_DLL_DECL Tensor Tensor::copy_to( + const PlaceType &target_place) const; +template PD_DLL_DECL Tensor Tensor::copy_to( + const PlaceType &target_place) const; +template PD_DLL_DECL Tensor +Tensor::copy_to(const PlaceType &target_place) const; template PD_DLL_DECL float *Tensor::data() const; template PD_DLL_DECL double *Tensor::data() const; @@ -226,6 +253,12 @@ template PD_DLL_DECL uint8_t *Tensor::data() const; template PD_DLL_DECL int8_t *Tensor::data() const; template PD_DLL_DECL int16_t *Tensor::data() const; template PD_DLL_DECL bool *Tensor::data() const; +template PD_DLL_DECL paddle::platform::complex64 * +Tensor::data() const; +template PD_DLL_DECL paddle::platform::complex128 * +Tensor::data() const; +template PD_DLL_DECL paddle::platform::float16 * +Tensor::data() const; template PD_DLL_DECL float *Tensor::mutable_data(); template PD_DLL_DECL double *Tensor::mutable_data(); @@ -235,6 +268,12 @@ template PD_DLL_DECL uint8_t *Tensor::mutable_data(); template PD_DLL_DECL int8_t *Tensor::mutable_data(); template PD_DLL_DECL int16_t *Tensor::mutable_data(); template PD_DLL_DECL bool *Tensor::mutable_data(); +template PD_DLL_DECL paddle::platform::complex64 * +Tensor::mutable_data(); +template PD_DLL_DECL paddle::platform::complex128 * +Tensor::mutable_data(); +template PD_DLL_DECL paddle::platform::float16 * +Tensor::mutable_data(); template PD_DLL_DECL float *Tensor::mutable_data(const PlaceType &place); template PD_DLL_DECL double *Tensor::mutable_data( @@ -250,6 +289,12 @@ template PD_DLL_DECL int8_t *Tensor::mutable_data( template PD_DLL_DECL int16_t *Tensor::mutable_data( const PlaceType &place); template PD_DLL_DECL bool *Tensor::mutable_data(const PlaceType &place); +template 
PD_DLL_DECL paddle::platform::complex64 * +Tensor::mutable_data(const PlaceType &place); +template PD_DLL_DECL paddle::platform::complex128 * +Tensor::mutable_data(const PlaceType &place); +template PD_DLL_DECL paddle::platform::float16 * +Tensor::mutable_data(const PlaceType &place); std::vector Tensor::shape() const { GET_CASTED_TENSOR @@ -310,6 +355,21 @@ Tensor Tensor::cast(const DataType &target_type) const { framework::VisitDataType( dst_type, CastDataType(*tensor, rlt_tensor_, ctx)); break; + case framework::proto::VarType::COMPLEX64: + framework::VisitDataType( + dst_type, + CastDataType(*tensor, rlt_tensor_, ctx)); + break; + case framework::proto::VarType::COMPLEX128: + framework::VisitDataType(dst_type, + CastDataType( + *tensor, rlt_tensor_, ctx)); + break; + case framework::proto::VarType::FP16: + framework::VisitDataType( + dst_type, + CastDataType(*tensor, rlt_tensor_, ctx)); + break; // TODO(JiabinYang) Support more dtype here default: PADDLE_THROW(platform::errors::Unimplemented( @@ -324,6 +384,15 @@ int64_t Tensor::size() const { return tensor->numel(); } +bool Tensor::is_initialized() const { + GET_CASTED_TENSOR; + if (tensor->IsInitialized()) { + return true; + } else { + return false; + } +} + #ifdef PADDLE_WITH_CUDA cudaStream_t Tensor::stream() const { if (!stream_.IsStreamSet()) { diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 43bbc06787e9baf89dd059487531c4ecb6fd6f6a..4644e674ba4853f1ad5e4710c441d6bc73906635 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -100,8 +100,16 @@ if (WITH_GPU) endif() cc_test(var_type_traits_test SRCS var_type_traits_test.cc DEPS var_type_traits) +set(BRPC_DEPS "") +if(WITH_PSLIB OR WITH_PSCORE) + set(BRPC_DEPS brpc) + if(WITH_PSLIB_BRPC) + set(BRPC_DEPS pslib_brpc) + endif() +endif() + cc_library(scope SRCS scope.cc DEPS glog threadpool xxhash var_type_traits) -cc_library(device_worker SRCS device_worker.cc DEPS trainer_desc_proto lod_tensor scope) +cc_library(device_worker SRCS device_worker.cc DEPS trainer_desc_proto lod_tensor scope ${BRPC_DEPS}) cc_test(device_worker_test SRCS device_worker_test.cc DEPS device_worker) cc_library(scope_pool SRCS scope_pool.cc DEPS scope) @@ -191,13 +199,15 @@ if(WITH_PYTHON) py_proto_compile(distributed_strategy_py_proto SRCS distributed_strategy.proto) #Generate an empty \ #__init__.py to make framework_py_proto as a valid python module. 
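+  # fleet_proto_init creates python/paddle/distributed/fleet/proto and its empty
+  # __init__.py once; framework_py_proto (and, under WITH_PSCORE, index_dataset_proto_init
+  # below) depend on it, so the per-platform proto-copy commands no longer create
+  # that directory themselves.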
+ add_custom_target(fleet_proto_init ALL + COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto + COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto/__init__.py + ) add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py) - add_dependencies(framework_py_proto framework_py_proto_init trainer_py_proto distributed_strategy_py_proto) + add_dependencies(framework_py_proto framework_py_proto_init trainer_py_proto distributed_strategy_py_proto fleet_proto_init) if (NOT WIN32) add_custom_command(TARGET framework_py_proto POST_BUILD COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto - COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto - COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto/__init__.py COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/ COMMAND cp distributed_strategy_*.py ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto COMMENT "Copy generated python proto into directory paddle/fluid/proto." @@ -207,8 +217,6 @@ if(WITH_PYTHON) string(REPLACE "/" "\\" fleet_proto_dstpath "${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto/") add_custom_command(TARGET framework_py_proto POST_BUILD COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto - COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto - COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto/__init__.py COMMAND copy /Y *.py ${proto_dstpath} COMMAND copy /Y distributed_strategy_*.py ${fleet_proto_dstpath} COMMENT "Copy generated python proto into directory paddle/fluid/proto." 
@@ -217,6 +225,12 @@ if(WITH_PYTHON) endif(NOT WIN32) endif() +if (WITH_PSCORE) + add_custom_target(index_dataset_proto_init ALL DEPENDS fleet_proto_init index_dataset_py_proto + COMMAND cp ${PADDLE_BINARY_DIR}/paddle/fluid/distributed/index_dataset/index_dataset_*.py ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto + COMMENT "Copy generated python proto into directory paddle/distributed/fleet/proto.") +endif(WITH_PSCORE) + cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor) cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog) @@ -237,9 +251,16 @@ if(WITH_DISTRIBUTE) fleet_wrapper heter_wrapper ps_gpu_wrapper box_wrapper lodtensor_printer lod_rank_table feed_fetch_method collective_helper ${GLOB_DISTRIBUTE_DEPS} graph_to_program_pass variable_helper data_feed_proto timer monitor - heter_service_proto pslib_brpc) + heter_service_proto ${BRPC_DEP}) set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") + if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0) + set(DISTRIBUTE_COMPILE_FLAGS + "${DISTRIBUTE_COMPILE_FLAGS} -faligned-new") + endif() set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + set_source_files_properties(device_worker.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + set_source_files_properties(hetercpu_worker.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + set_source_files_properties(heterxpu_trainer.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) elseif(WITH_PSCORE) cc_library(executor SRCS executor.cc multi_trainer.cc pipeline_trainer.cc dataset_factory.cc dist_multi_trainer.cc trainer_factory.cc trainer.cc data_feed_factory.cc @@ -274,7 +295,7 @@ elseif(WITH_PSLIB) pull_dense_worker.cc section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry device_context scope framework_proto data_feed_proto heter_service_proto trainer_desc_proto glog lod_rank_table fs shell fleet_wrapper heter_wrapper ps_gpu_wrapper box_wrapper lodtensor_printer feed_fetch_method - graph_to_program_pass variable_helper timer monitor pslib_brpc ) + graph_to_program_pass variable_helper timer monitor ${BRPC_DEP}) else() cc_library(executor SRCS executor.cc multi_trainer.cc pipeline_trainer.cc dataset_factory.cc dist_multi_trainer.cc trainer_factory.cc trainer.cc data_feed_factory.cc @@ -295,8 +316,14 @@ cc_library(parallel_executor SRCS parallel_executor.cc DEPS fast_threaded_ssa_graph_executor variable_helper) cc_library(executor_cache SRCS executor_cache.cc DEPS executor) -cc_test(dist_multi_trainer_test SRCS dist_multi_trainer_test.cc DEPS - conditional_block_op executor) +if(WITH_PSCORE) + get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS) + cc_test(dist_multi_trainer_test SRCS dist_multi_trainer_test.cc DEPS + conditional_block_op executor ${RPC_DEPS}) +else() + cc_test(dist_multi_trainer_test SRCS dist_multi_trainer_test.cc DEPS + conditional_block_op executor) +endif() cc_library(prune SRCS prune.cc DEPS framework_proto boost) cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context) cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry @@ -346,97 +373,20 @@ message(STATUS "branch: ${PADDLE_BRANCH}") configure_file(commit.h.in commit.h) +# Adapt to custom op mechanism: Include the header files related to the data type +# to avoid exposing the path of the underlying file +include_directories(${PADDLE_SOURCE_DIR}/paddle/fluid/platform) 
+include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../extension/include) + cc_library(custom_tensor SRCS ../extension/src/ext_tensor.cc DEPS lod_tensor memory enforce) cc_library(op_meta_info SRCS ../extension/src/ext_op_meta_info.cc DEPS custom_tensor) cc_library(custom_operator SRCS custom_operator.cc DEPS tensor attribute framework_proto op_registry operator dynamic_loader string_helper custom_tensor op_meta_info) cc_test(custom_tensor_test SRCS custom_tensor_test.cc DEPS custom_tensor glog) -include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../extension/include) - set(FLUID_FRAMEWORK_MODULES proto_desc memory lod_tensor executor data_feed_proto layer dynamic_loader custom_operator) cc_library(paddle_framework DEPS ${FLUID_FRAMEWORK_MODULES}) -# Old custom op extension mechanism related, will be removed in 2.1.0 -cc_library(paddle_framework_shared - SHARED SRCS executor.cc operator.cc - ${CMAKE_CURRENT_SOURCE_DIR}/c/c_api.cc - ${CMAKE_SOURCE_DIR}/paddle/fluid/imperative/layer.cc - DEPS ${FLUID_FRAMEWORK_MODULES}) -get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) -set_target_properties(paddle_framework_shared PROPERTIES OUTPUT_NAME paddle_framework) -target_link_libraries(paddle_framework_shared ${os_dependency_modules}) - -if (LINUX) - set(FLUID_FRAMEWORK_SHARED_LIB - ${PADDLE_BINARY_DIR}/paddle/fluid/framework/libpaddle_framework.so - CACHE INTERNAL "Fluid framework lib") -endif() - -if (WIN32) - if("${CMAKE_GENERATOR}" STREQUAL "Ninja") - set(paddle_framework_lib_path ${CMAKE_CURRENT_BINARY_DIR}) - else() - set(paddle_framework_lib_path ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}) - endif() - set(FLUID_FRAMEWORK_IMPORT_LIB - ${paddle_framework_lib_path}/paddle_framework.lib - CACHE INTERNAL "Fluid framework lib") - set(FLUID_FRAMEWORK_SHARED_LIB - ${paddle_framework_lib_path}/paddle_framework.dll - CACHE INTERNAL "Fluid framework dll") -endif() - -if(APPLE) - set(FLUID_FRAMEWORK_SHARED_LIB - ${PADDLE_BINARY_DIR}/paddle/fluid/framework/libpaddle_framework.dylib - CACHE INTERNAL "Fluid framework lib") -endif() if(WITH_TESTING AND TEST selected_rows_test) set_tests_properties(selected_rows_test PROPERTIES TIMEOUT 120) endif() - -# New custom op extension mechanism related - -# if not deps `layer`, will cause: undefined symbol: _ZN6paddle10imperative7VarBase9name_set_ -set(PADDLE_CUSTOM_OP_MODULES custom_tensor op_meta_info custom_operator layer) - -set(PADDLE_CUSTOM_OP_SRCS - ${CMAKE_CURRENT_SOURCE_DIR}/custom_operator.cc - ${CMAKE_CURRENT_SOURCE_DIR}/../extension/src/ext_tensor.cc - ${CMAKE_CURRENT_SOURCE_DIR}/../extension/src/ext_op_meta_info.cc - ${CMAKE_SOURCE_DIR}/paddle/fluid/imperative/layer.cc) -set(PADDLE_CUSTOM_OP_SRCS ${PADDLE_CUSTOM_OP_SRCS} PARENT_SCOPE) - -cc_library(paddle_custom_op_shared - SHARED SRCS ${PADDLE_CUSTOM_OP_SRCS} DEPS ${PADDLE_CUSTOM_OP_MODULES}) - -get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) -set_target_properties(paddle_custom_op_shared PROPERTIES OUTPUT_NAME paddle_custom_op) -target_link_libraries(paddle_custom_op_shared ${os_dependency_modules}) - -if (LINUX) - set(PADDLE_CUSTOM_OP_SHARED_LIB - ${PADDLE_BINARY_DIR}/paddle/fluid/framework/libpaddle_custom_op.so - CACHE INTERNAL "Paddle custom op lib") -endif() - -if (WIN32) - if("${CMAKE_GENERATOR}" STREQUAL "Ninja") - set(paddle_custom_op_lib_path ${CMAKE_CURRENT_BINARY_DIR}) - else() - set(paddle_custom_op_lib_path ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}) - endif() - set(PADDLE_CUSTOM_OP_IMPORT_LIB - 
${paddle_custom_op_lib_path}/paddle_custom_op.lib - CACHE INTERNAL "Paddle custom op import lib") - set(PADDLE_CUSTOM_OP_SHARED_LIB - ${paddle_custom_op_lib_path}/paddle_custom_op.dll - CACHE INTERNAL "Paddle custom op dll") -endif() - -if(APPLE) - set(PADDLE_CUSTOM_OP_SHARED_LIB - ${PADDLE_BINARY_DIR}/paddle/fluid/framework/paddle_custom_op.dylib - CACHE INTERNAL "Paddle custom op lib") -endif() diff --git a/paddle/fluid/framework/c/c_api.cc b/paddle/fluid/framework/c/c_api.cc deleted file mode 100644 index 5e73c5cc23afa46506d03d96c893d55592f572de..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/c/c_api.cc +++ /dev/null @@ -1,53 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/c/c_api.h" - -#include "paddle/fluid/framework/op_info.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/enforce.h" - -extern "C" { - -paddle::framework::OpInfoMap &PD_GetOpInfoMap() { - return paddle::framework::OpInfoMap::Instance(); -} - -void PD_InitDevicesPool(paddle::platform::DeviceContextPool *pool) { - paddle::platform::DeviceContextPool::SetPool(pool); -} - -std::vector PD_GetGradOpDescStrs( - const paddle::framework::OpDesc &op_desc, - const std::unordered_set &no_grad_set, - std::unordered_map *grad_to_var, - const std::vector &grad_block) { - auto &op_info = PD_GetOpInfoMap().Get(op_desc.Type()); - std::vector ret; - if (op_info.grad_op_maker_) { - auto grad_op_descs = - op_info.grad_op_maker_(op_desc, no_grad_set, grad_to_var, grad_block); - size_t op_num = grad_op_descs.size(); - ret.resize(op_num); - for (size_t i = 0; i < op_num; ++i) { - PADDLE_ENFORCE_EQ( - grad_op_descs[i]->Proto()->SerializePartialToString(&ret[i]), true, - paddle::platform::errors::Unavailable( - "Cannot serialize operator desc message.")); - } - } - return ret; -} - -} // end extern "C" diff --git a/paddle/fluid/framework/c/c_api.h b/paddle/fluid/framework/c/c_api.h deleted file mode 100644 index a9ec402f381e43b51887b6467d8d1baccf98ad37..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/c/c_api.h +++ /dev/null @@ -1,55 +0,0 @@ -/* copyright (c) 2019 paddlepaddle authors. all rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include -#include - -#include "paddle/fluid/framework/block_desc.h" -#include "paddle/fluid/framework/op_desc.h" -#include "paddle/fluid/framework/op_info.h" -#include "paddle/fluid/platform/device_context.h" - -namespace paddle { -namespace framework { -class OpInfoMap; -} // namespace framework -namespace platform { -class DeviceContextPool; -} // namespace platform -} // namespace paddle - -#ifdef __cplusplus -extern "C" { -#endif - -// C-API to get global OpInfo map. -paddle::framework::OpInfoMap &PD_GetOpInfoMap(); - -// C-API to init global DeviceContextPool from outside. -void PD_InitDevicesPool(paddle::platform::DeviceContextPool *pool); - -// C-API to serialize the grad op protocol message to a binary string. -std::vector PD_GetGradOpDescStrs( - const paddle::framework::OpDesc &op_desc, - const std::unordered_set &no_grad_set, - std::unordered_map *grad_to_var, - const std::vector &grad_block); - -#ifdef __cplusplus -} -#endif diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc index 66e28bb83ce3e42d0b7358bf462cb98e70617fe4..c4b833ec94c2940c6a655c2477845666ee543d7c 100644 --- a/paddle/fluid/framework/custom_operator.cc +++ b/paddle/fluid/framework/custom_operator.cc @@ -27,9 +27,7 @@ limitations under the License. */ #include "paddle/fluid/extension/include/ext_tensor.h" #include "paddle/fluid/framework/attribute.h" -#include "paddle/fluid/framework/c/c_api.h" #include "paddle/fluid/framework/custom_tensor_utils.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/op_meta_info_helper.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" @@ -63,6 +61,11 @@ inline bool IsGradVar(const std::string& var_name) { return var_name.rfind(suffix) != std::string::npos; } +inline bool IsDuplicableVar(const std::string& var_name) { + std::string suffix = kTensorVectorSuffix; + return var_name.rfind(suffix) != std::string::npos; +} + inline std::string NoGrad(const std::string& var_name) { std::string suffix = kGradVarSuffix; return var_name.substr(0, var_name.size() - kGradVarSuffixSize); @@ -103,19 +106,47 @@ static void RunKernelFunc(const framework::ExecutionContext& ctx, const std::vector& attrs) { VLOG(1) << "Custom Operator: Start run KernelFunc."; std::vector custom_ins; + std::vector> custom_vec_ins; for (auto& in_name : inputs) { VLOG(1) << "Custom Operator: input name - " << in_name; - auto* x = ctx.Input(in_name); - PADDLE_ENFORCE_NOT_NULL(x, platform::errors::NotFound( - "Input tensor (%s) is nullptr.", in_name)); - PADDLE_ENFORCE_EQ(x->IsInitialized(), true, - platform::errors::InvalidArgument( - "Input tensor (%s) is not initialized.")); - auto custom_in = paddle::Tensor( - CustomTensorUtils::ConvertInnerPlaceToEnumPlace(x->place())); - CustomTensorUtils::ShareDataFrom(static_cast(x), custom_in); - CustomTensorUtils::SetTensorCurrentStream(&custom_in, ctx.GetPlace()); - custom_ins.emplace_back(custom_in); + if (detail::IsDuplicableVar(in_name)) { + // return const std::vector + auto vec_x = ctx.MultiInput(in_name); + PADDLE_ENFORCE_NE(vec_x.empty(), true, + platform::errors::NotFound( + "Input vector (%s) is empty.", in_name)); + std::vector custom_vec_in; + for (size_t i = 0; i < vec_x.size(); ++i) { + auto* x = vec_x[i]; + PADDLE_ENFORCE_NOT_NULL( + x, platform::errors::NotFound( + "The %d-th tensor in input vector (%s) is nullptr.", + i, in_name)); + PADDLE_ENFORCE_EQ(x->IsInitialized(), true, + 
platform::errors::InvalidArgument( + "The %d-th tensor in input vector (%s) " + "is not initialized.", + i, in_name)); + auto custom_t = paddle::Tensor( + CustomTensorUtils::ConvertInnerPlaceToEnumPlace(x->place())); + CustomTensorUtils::ShareDataFrom(static_cast(x), custom_t); + CustomTensorUtils::SetTensorCurrentStream(&custom_t, ctx.GetPlace()); + custom_vec_in.emplace_back(custom_t); + } + custom_vec_ins.emplace_back(custom_vec_in); + } else { + auto* x = ctx.Input(in_name); + PADDLE_ENFORCE_NOT_NULL(x, platform::errors::NotFound( + "Input tensor (%s) is nullptr.", in_name)); + PADDLE_ENFORCE_EQ(x->IsInitialized(), true, + platform::errors::InvalidArgument( + "Input tensor (%s) is not initialized.", in_name)); + auto custom_in = paddle::Tensor( + CustomTensorUtils::ConvertInnerPlaceToEnumPlace(x->place())); + CustomTensorUtils::ShareDataFrom(static_cast(x), custom_in); + CustomTensorUtils::SetTensorCurrentStream(&custom_in, ctx.GetPlace()); + custom_ins.emplace_back(custom_in); + } } std::vector custom_attrs; @@ -146,21 +177,41 @@ static void RunKernelFunc(const framework::ExecutionContext& ctx, "Unsupported `%s` type value as custom attribute now. " "Supported data types include `bool`, `int`, `float`, " "`int64_t`, `std::string`, `std::vector`, " - "`std::vector`, `std::vector, " + "`std::vector`, `std::vector`, " "`std::vector`, Please check whether " "the attribute data type and data type string are matched.", attr_type_str)); } } - VLOG(1) << "Run ComputeFunc."; + VLOG(1) << "Custom Operator: Run ComputeFunc."; try { - auto outs = func(custom_ins, custom_attrs); + auto outs = func(custom_ins, custom_vec_ins, custom_attrs); VLOG(1) << "Custom Operator: Share outputs into ExecutionContext."; for (size_t i = 0; i < outputs.size(); ++i) { - auto* true_out = ctx.Output(outputs[i]); - CustomTensorUtils::ShareDataTo(outs.at(i), true_out); + auto out_name = outputs[i]; + if (detail::IsDuplicableVar(out_name)) { + PADDLE_ENFORCE(i == 0UL && outputs.size() == 1UL, + platform::errors::PreconditionNotMet( + "If custom operator's outputs contains `paddle::Vec(" + ")` type, " + "it only can hold one output.")); + auto vec_true_outs = ctx.MultiOutput(out_name); + PADDLE_ENFORCE_EQ( + vec_true_outs.size(), outs.size(), + platform::errors::InvalidArgument( + "The number of element in custom operator outputs is wrong, " + "expected contains %d Tensors, but actually contains %d " + "Tensors.", + vec_true_outs.size(), outs.size())); + for (size_t j = 0; j < vec_true_outs.size(); ++j) { + CustomTensorUtils::ShareDataTo(outs.at(j), vec_true_outs.at(j)); + } + } else { + auto* true_out = ctx.Output(out_name); + CustomTensorUtils::ShareDataTo(outs.at(i), true_out); + } } } catch (platform::EnforceNotMet& exception) { throw std::move(exception); @@ -195,7 +246,7 @@ class CustomOperator : public OperatorWithKernel { * it can only be determined at runtime. 
*/ framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const { + const framework::ExecutionContext& ctx) const override { return framework::OpKernelType(proto::VarType::RAW, ctx.GetPlace()); } @@ -206,7 +257,7 @@ class CustomOperator : public OperatorWithKernel { */ framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, const Tensor& tensor, - const OpKernelType& expected_kernel_type) { + const OpKernelType& expected_kernel_type) const override { return OpKernelType(expected_kernel_type.data_type_, expected_kernel_type.place_, tensor.layout()); } @@ -221,10 +272,20 @@ class CustomOpMaker : public OpProtoAndCheckerMaker { void Make() override { for (auto& in_name : inputs_) { - AddInput(in_name, "The input " + in_name + "of Custom operator."); + if (detail::IsDuplicableVar(in_name)) { + AddInput(in_name, "The input " + in_name + "of Custom operator.") + .AsDuplicable(); + } else { + AddInput(in_name, "The input " + in_name + "of Custom operator."); + } } for (auto& out_name : outputs_) { - AddOutput(out_name, "The output " + out_name + "of Custom Operator."); + if (detail::IsDuplicableVar(out_name)) { + AddOutput(out_name, "The output " + out_name + "of Custom Operator.") + .AsDuplicable(); + } else { + AddOutput(out_name, "The output " + out_name + "of Custom Operator."); + } } for (auto& attr : attrs_) { auto attr_name_and_type = detail::ParseAttrStr(attr); @@ -265,7 +326,7 @@ class CustomOpMaker : public OpProtoAndCheckerMaker { "Unsupported `%s` type value as custom attribute now. " "Supported data types include `bool`, `int`, `float`, " "`int64_t`, `std::string`, `std::vector`, " - "`std::vector`, `std::vector, " + "`std::vector`, `std::vector`, " "`std::vector`, Please check whether " "the attribute data type and data type string are matched.", attr_type_str)); @@ -331,7 +392,13 @@ class CustomGradOpMaker : public SingleGradOpMaker { } for (auto& out_name : outputs_) { VLOG(1) << "Custom Operator: GradOpDescMaker - output: " << out_name; - grad_op->SetOutput(out_name, this->InputGrad(detail::NoGrad(out_name))); + if (detail::IsDuplicableVar(out_name)) { + grad_op->SetOutput(out_name, + this->InputGrad(detail::NoGrad(out_name), + /*drop_empty_grad=*/false)); + } else { + grad_op->SetOutput(out_name, this->InputGrad(detail::NoGrad(out_name))); + } } grad_op->SetAttrMap(this->Attrs()); } @@ -493,9 +560,9 @@ void RegisterOperatorWithMetaInfo( platform::errors::Unavailable( "Your custom operator contains multiple inputs. " "We only allow a custom operator that contains only one input " - "and " - "only one output without setting the InferShapeFn. At this time, " - "the input shape will be directly set to the output shape.\n" + "and only one output without setting the InferShapeFn. " + "At this time, the input shape will be directly set to " + "the output shape.\n" "Please set the InferShapeFn of custom " "operator by .SetInferShapeFn(PD_INFER_SHAPE(...))")); PADDLE_ENFORCE_EQ( @@ -503,9 +570,9 @@ void RegisterOperatorWithMetaInfo( platform::errors::Unavailable( "Your custom operator contains multiple outputs. " "We only allow a custom operator that contains only one input " - "and " - "only one output without setting the InferShapeFn. At this time, " - "the input shape will be directly set to the output shape.\n" + "and only one output without setting the InferShapeFn. 
" + "At this time, the input shape will be directly set to " + "the output shape.\n" "Please set the InferShapeFn of custom " "operator by .SetInferShapeFn(PD_INFER_SHAPE(...))")); @@ -513,24 +580,91 @@ void RegisterOperatorWithMetaInfo( ctx->ShareDim(op_inputs[0], op_outputs[0]); }; } else { - info.infer_shape_ = [op_inputs, op_outputs, + info.infer_shape_ = [op_inputs, op_outputs, op_attrs, infer_shape_func](InferShapeContext* ctx) { std::vector> input_shapes; + std::vector>> vec_input_shapes; VLOG(1) << "Custom Operator: InferShape - get input ddim."; for (auto& in_name : op_inputs) { - OP_INOUT_CHECK(ctx->HasInput(in_name), "Input", in_name, "Custom"); - auto ddim = ctx->GetInputDim(in_name); - input_shapes.emplace_back(framework::vectorize(ddim)); + if (detail::IsDuplicableVar(in_name)) { + OP_INOUT_CHECK(ctx->HasInputs(in_name), "Input", in_name, "Custom"); + auto vec_ddim = ctx->GetInputsDim(in_name); + std::vector> vec_shape; + vec_shape.reserve(vec_ddim.size()); + std::transform(vec_ddim.begin(), vec_ddim.end(), + std::back_inserter(vec_shape), + [&](const DDim& ddim) -> std::vector { + return framework::vectorize(ddim); + }); + vec_input_shapes.emplace_back(vec_shape); + } else { + OP_INOUT_CHECK(ctx->HasInput(in_name), "Input", in_name, "Custom"); + auto ddim = ctx->GetInputDim(in_name); + input_shapes.emplace_back(framework::vectorize(ddim)); + } + } + + std::vector custom_attrs; + for (auto& attr_str : op_attrs) { + auto attr_name_and_type = detail::ParseAttrStr(attr_str); + auto attr_name = attr_name_and_type[0]; + auto attr_type_str = attr_name_and_type[1]; + if (attr_type_str == "bool") { + custom_attrs.emplace_back(ctx->Attrs().Get(attr_name)); + } else if (attr_type_str == "int") { + custom_attrs.emplace_back(ctx->Attrs().Get(attr_name)); + } else if (attr_type_str == "float") { + custom_attrs.emplace_back(ctx->Attrs().Get(attr_name)); + } else if (attr_type_str == "int64_t") { + custom_attrs.emplace_back(ctx->Attrs().Get(attr_name)); + } else if (attr_type_str == "std::string") { + custom_attrs.emplace_back(ctx->Attrs().Get(attr_name)); + } else if (attr_type_str == "std::vector") { + custom_attrs.emplace_back( + ctx->Attrs().Get>(attr_name)); + } else if (attr_type_str == "std::vector") { + custom_attrs.emplace_back( + ctx->Attrs().Get>(attr_name)); + } else if (attr_type_str == "std::vector") { + // NOTE(chenweihang): InferShape can't support std::vector + // attr type, because the input type is std::vector, only + // can use one rule to parse std::vector parameter + continue; + } else if (attr_type_str == "std::vector") { + custom_attrs.emplace_back( + ctx->Attrs().Get>(attr_name)); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported `%s` type value as custom attribute now. 
" + "Supported data types include `bool`, `int`, `float`, " + "`int64_t`, `std::string`, `std::vector`, " + "`std::vector`, `std::vector`, " + "Please check whether the attribute data type and " + "data type string are matched.", + attr_type_str)); + } } VLOG(1) << "Custom Operator: InferShape - calc output ddim."; - auto output_shapes = infer_shape_func(input_shapes); + auto output_shapes = + infer_shape_func(input_shapes, vec_input_shapes, custom_attrs); VLOG(1) << "Custom Operator: InferShape - set output ddim."; for (size_t i = 0; i < op_outputs.size(); ++i) { - ctx->SetOutputDim(op_outputs[i], - framework::make_ddim(output_shapes[i])); + auto out_name = op_outputs[i]; + if (detail::IsDuplicableVar(out_name)) { + std::vector vec_ddim; + vec_ddim.reserve(output_shapes.size()); + std::transform(output_shapes.begin(), output_shapes.end(), + std::back_inserter(vec_ddim), + [&](const std::vector& shape) -> DDim { + return framework::make_ddim(shape); + }); + ctx->SetOutputsDim(out_name, vec_ddim); + } else { + ctx->SetOutputDim(out_name, framework::make_ddim(output_shapes[i])); + } } }; } @@ -544,9 +678,9 @@ void RegisterOperatorWithMetaInfo( platform::errors::Unavailable( "Your custom operator contains multiple inputs. " "We only allow a custom operator that contains only one input " - "and " - "only one output without setting the InferDtypeFn. At this time, " - "the input dtype will be directly set to the output dtype.\n" + "and only one output without setting the InferDtypeFn. " + "At this time, the input dtype will be directly set to " + "the output dtype.\n" "Please set the InferDtypeFn of custom " "operator by .SetInferDtypeFn(PD_INFER_DTYPE(...))")); PADDLE_ENFORCE_EQ( @@ -554,9 +688,9 @@ void RegisterOperatorWithMetaInfo( platform::errors::Unavailable( "Your custom operator contains multiple outputs. " "We only allow a custom operator that contains only one input " - "and " - "only one output without setting the InferDtypeFn. At this time, " - "the input dtype will be directly set to the output dtype.\n" + "and only one output without setting the InferDtypeFn. 
" + "At this time, the input dtype will be directly set to " + "the output dtype.\n" "Please set the InferDtypeFn of custom " "operator by .SetInferDtypeFn(PD_INFER_DTYPE(...))")); @@ -568,22 +702,42 @@ void RegisterOperatorWithMetaInfo( info.infer_var_type_ = [op_inputs, op_outputs, infer_dtype_func](InferVarTypeContext* ctx) { std::vector input_dtypes; + std::vector> vec_input_dtypes; VLOG(1) << "Custom Operator: InferDtype - get input dtype."; for (auto& in_name : op_inputs) { - auto dtype = ctx->GetInputDataType(in_name); - input_dtypes.emplace_back( - CustomTensorUtils::ConvertInnerDTypeToEnumDType(dtype)); + if (detail::IsDuplicableVar(in_name)) { + std::vector vec_custom_dtype; + for (size_t i = 0; i < ctx->InputSize(in_name); ++i) { + auto dtype = ctx->GetInputDataType(in_name, i); + vec_custom_dtype.emplace_back( + CustomTensorUtils::ConvertInnerDTypeToEnumDType(dtype)); + } + vec_input_dtypes.emplace_back(vec_custom_dtype); + } else { + auto dtype = ctx->GetInputDataType(in_name); + input_dtypes.emplace_back( + CustomTensorUtils::ConvertInnerDTypeToEnumDType(dtype)); + } } VLOG(1) << "Custom Operator: InferDtype - infer output dtype."; - auto output_dtypes = infer_dtype_func(input_dtypes); + auto output_dtypes = infer_dtype_func(input_dtypes, vec_input_dtypes); VLOG(1) << "Custom Operator: InferDtype - set output dtype."; for (size_t i = 0; i < op_outputs.size(); ++i) { - ctx->SetOutputDataType( - op_outputs[i], - CustomTensorUtils::ConvertEnumDTypeToInnerDType(output_dtypes[i])); + auto out_name = op_outputs[i]; + if (detail::IsDuplicableVar(out_name)) { + for (size_t j = 0; j < output_dtypes.size(); ++j) { + auto dtype = CustomTensorUtils::ConvertEnumDTypeToInnerDType( + output_dtypes[i]); + ctx->SetOutputDataType(out_name, dtype, j); + } + } else { + ctx->SetOutputDataType( + out_name, CustomTensorUtils::ConvertEnumDTypeToInnerDType( + output_dtypes[i])); + } } }; } @@ -644,10 +798,39 @@ void RegisterOperatorWithMetaInfo( return new CustomOperator(type, inputs, outputs, attrs); }; - // Grad InferShape (gradient's shape is same with forward input default) - grad_info.infer_shape_ = [grad_op_outputs](InferShapeContext* ctx) { + // Grad InferShape + grad_info.infer_shape_ = [grad_op_inputs, + grad_op_outputs](InferShapeContext* ctx) { + // 1. if forward input exists, gradient's shape is same with forward input + // default + // [Suitable for most situations] + // 2. if forward input not exists, and only contains one grad input and + // output, + // use grad input shape as grad output shape + // [Suitable for the situation that forward input is not used as + // backward input] + // TODO(chenweihang): support set grad op infershape func if needed for (auto& out_name : grad_op_outputs) { - ctx->ShareDim(detail::NoGrad(out_name), out_name); + auto fwd_name = detail::NoGrad(out_name); + if (detail::IsDuplicableVar(fwd_name)) { + // Duplicable forward var must as backward input + ctx->ShareDim(fwd_name, out_name); + } else { + if (ctx->HasInput(fwd_name)) { + ctx->ShareDim(fwd_name, out_name); + } else { + PADDLE_ENFORCE_EQ( + grad_op_inputs.size() == 1UL && grad_op_outputs.size() == 1UL, + true, + platform::errors::Unavailable( + "Custom grad operator infershape error. " + "If a custom grad operator contains only one input and " + "only one output, the input shape will be directly set to " + "the output shape. 
Otherwise, Please set the forward input " + "as the grad operator's input.")); + ctx->ShareDim(grad_op_inputs[0], out_name); + } + } } }; diff --git a/paddle/fluid/framework/custom_operator.h b/paddle/fluid/framework/custom_operator.h index 117841f80cf47ed95251fee1d01f7fd87caa600b..259901c09f3e00729876d7bea062237ad5bad94a 100644 --- a/paddle/fluid/framework/custom_operator.h +++ b/paddle/fluid/framework/custom_operator.h @@ -28,5 +28,8 @@ void LoadOpMetaInfoAndRegisterOp(const std::string& dso_name); void RegisterOperatorWithMetaInfoMap( const paddle::OpMetaInfoMap& op_meta_info_map); +// Interface for selective register custom op. +void RegisterOperatorWithMetaInfo(const std::vector& op_meta_infos); + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/custom_tensor_test.cc b/paddle/fluid/framework/custom_tensor_test.cc index 2e42248f64bec7d3d30271a61688e8530bf34b83..a65dcbd55f94630612ce59b4d07b0789aaf7c697 100644 --- a/paddle/fluid/framework/custom_tensor_test.cc +++ b/paddle/fluid/framework/custom_tensor_test.cc @@ -109,6 +109,12 @@ void GroupTestCopy() { TestCopyTensor(); VLOG(2) << "uint8 cpu-cpu-gpu-gpu-cpu"; TestCopyTensor(); + VLOG(2) << "complex64 cpu-cpu-gpu-gpu-cpu"; + TestCopyTensor(); + VLOG(2) << "complex128 cpu-cpu-gpu-gpu-cpu"; + TestCopyTensor(); + VLOG(2) << "Fp16 cpu-cpu-gpu-gpu-cpu"; + TestCopyTensor(); } void GroupTestCast() { @@ -126,6 +132,12 @@ void GroupTestCast() { TestCast(paddle::DataType::FLOAT32); VLOG(2) << "float cast"; TestCast(paddle::DataType::FLOAT32); + VLOG(2) << "complex64 cast"; + TestCast(paddle::DataType::FLOAT32); + VLOG(2) << "complex128 cast"; + TestCast(paddle::DataType::FLOAT32); + VLOG(2) << "float16 cast"; + TestCast(paddle::DataType::FLOAT16); } void GroupTestDtype() { @@ -136,6 +148,9 @@ void GroupTestDtype() { CHECK(TestDtype() == paddle::DataType::INT16); CHECK(TestDtype() == paddle::DataType::INT8); CHECK(TestDtype() == paddle::DataType::UINT8); + CHECK(TestDtype() == paddle::DataType::COMPLEX64); + CHECK(TestDtype() == paddle::DataType::COMPLEX128); + CHECK(TestDtype() == paddle::DataType::FLOAT16); } void GroupTestDtypeConvert() { @@ -162,6 +177,15 @@ void GroupTestDtypeConvert() { paddle::framework::proto::VarType::INT16); CHECK(paddle::framework::CustomTensorUtils::ConvertEnumDTypeToInnerDType( paddle::DataType::BOOL) == paddle::framework::proto::VarType::BOOL); + CHECK(paddle::framework::CustomTensorUtils::ConvertEnumDTypeToInnerDType( + paddle::DataType::COMPLEX64) == + paddle::framework::proto::VarType::COMPLEX64); + CHECK(paddle::framework::CustomTensorUtils::ConvertEnumDTypeToInnerDType( + paddle::DataType::COMPLEX128) == + paddle::framework::proto::VarType::COMPLEX128); + CHECK(paddle::framework::CustomTensorUtils::ConvertEnumDTypeToInnerDType( + paddle::DataType::FLOAT16) == + paddle::framework::proto::VarType::FP16); // proto -> enum CHECK(paddle::framework::CustomTensorUtils::ConvertInnerDTypeToEnumDType( paddle::framework::proto::VarType::FP64) == @@ -185,6 +209,30 @@ void GroupTestDtypeConvert() { paddle::DataType::INT16); CHECK(paddle::framework::CustomTensorUtils::ConvertInnerDTypeToEnumDType( paddle::framework::proto::VarType::BOOL) == paddle::DataType::BOOL); + CHECK(paddle::framework::CustomTensorUtils::ConvertInnerDTypeToEnumDType( + paddle::framework::proto::VarType::COMPLEX64) == + paddle::DataType::COMPLEX64); + CHECK(paddle::framework::CustomTensorUtils::ConvertInnerDTypeToEnumDType( + paddle::framework::proto::VarType::COMPLEX128) == + paddle::DataType::COMPLEX128); + 
CHECK(paddle::framework::CustomTensorUtils::ConvertInnerDTypeToEnumDType( + paddle::framework::proto::VarType::FP16) == + paddle::DataType::FLOAT16); +} + +void TestInitilized() { + paddle::Tensor test_tensor(paddle::PlaceType::kCPU); + CHECK(test_tensor.is_initialized() == false); + test_tensor.reshape({1, 1}); + test_tensor.mutable_data(); + CHECK(test_tensor.is_initialized() == true); + float* tensor_data = test_tensor.data(); + for (int i = 0; i < test_tensor.size(); i++) { + tensor_data[i] = 0.5; + } + for (int i = 0; i < test_tensor.size(); i++) { + CHECK(tensor_data[i] == 0.5); + } } TEST(CustomTensor, copyTest) { @@ -200,4 +248,6 @@ TEST(CustomTensor, copyTest) { GroupTestCast(); VLOG(2) << "TestDtypeConvert"; GroupTestDtypeConvert(); + VLOG(2) << "TestInitilized"; + TestInitilized(); } diff --git a/paddle/fluid/framework/custom_tensor_utils.h b/paddle/fluid/framework/custom_tensor_utils.h index 919a3a1a49c73b9a3e06265485ef08c7108a8082..809a6b965aad9bcb4594ecff99e460db723dfd53 100644 --- a/paddle/fluid/framework/custom_tensor_utils.h +++ b/paddle/fluid/framework/custom_tensor_utils.h @@ -37,7 +37,7 @@ class CustomTensorUtils { /// \brief Share data FROM another tensor. /// Use this to pass tensor from op to op /// \return void. - static void ShareDataFrom(const void* src, const Tensor& dst); + static void ShareDataFrom(const void* src, const paddle::Tensor& dst); static framework::proto::VarType::Type ConvertEnumDTypeToInnerDType( const paddle::DataType& dtype) { @@ -56,6 +56,12 @@ class CustomTensorUtils { return framework::proto::VarType::INT64; case paddle::DataType::INT16: return framework::proto::VarType::INT16; + case paddle::DataType::COMPLEX64: + return framework::proto::VarType::COMPLEX64; + case paddle::DataType::COMPLEX128: + return framework::proto::VarType::COMPLEX128; + case paddle::DataType::FLOAT16: + return framework::proto::VarType::FP16; case paddle::DataType::BOOL: return framework::proto::VarType::BOOL; default: @@ -83,6 +89,12 @@ class CustomTensorUtils { return paddle::DataType::UINT8; case framework::proto::VarType::INT16: return paddle::DataType::INT16; + case framework::proto::VarType::COMPLEX64: + return paddle::DataType::COMPLEX64; + case framework::proto::VarType::COMPLEX128: + return paddle::DataType::COMPLEX128; + case framework::proto::VarType::FP16: + return paddle::DataType::FLOAT16; case framework::proto::VarType::BOOL: return paddle::DataType::BOOL; default: diff --git a/paddle/fluid/framework/data_type.h b/paddle/fluid/framework/data_type.h index 7aa7b7b2d96cf93dd2c0e3ba34b14307c230fa81..c8f73a5469ab32a5734d980010a52a6f72eb6ca8 100644 --- a/paddle/fluid/framework/data_type.h +++ b/paddle/fluid/framework/data_type.h @@ -20,6 +20,7 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/complex128.h" #include "paddle/fluid/platform/complex64.h" +#include "paddle/fluid/platform/eigen_ext.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/float16.h" diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index 34c87b8388975aa108bbb2ecdaded1ff4a33d16c..5636e3ed1b63f9e4b9854ebcaac4a84a4c20176b 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -161,9 +161,6 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { #if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \ !defined(_WIN32) && !defined(__APPLE__) AppendPassWithCheck(strategy_.enable_auto_fusion_, "fusion_group_pass"); -#else - LOG(WARNING) << "fusion_group is not enabled for Windows/MacOS now, and " - "only effective when running with CUDA GPU."; #endif AppendPassWithCheck(strategy_.fuse_elewise_add_act_ops_, "fuse_elewise_add_act_pass"); @@ -265,12 +262,11 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { if (FLAGS_use_mkldnn) { AppendPass(pass_name); } else if (!strategy_.mkldnn_enabled_op_types_.empty()) { - LOG(WARNING) - << "mkldnn_enabled_op_types specify the operator type list to " - "use MKLDNN acceleration. It is null in default, means " - "that all the operators supported by MKLDNN will be " - "accelerated. And it should not be set when " - "FLAGS_use_mkldnn=false."; + VLOG(1) << "mkldnn_enabled_op_types specify the operator type list to " + "use MKLDNN acceleration. It is null in default, means " + "that all the operators supported by MKLDNN will be " + "accelerated. And it should not be set when " + "FLAGS_use_mkldnn=false."; } #else PADDLE_ENFORCE_NE(FLAGS_use_mkldnn, true, @@ -403,26 +399,26 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph, << ", num_trainers:" << num_trainers_; } else if (pass->Type() == "fuse_relu_depthwise_conv_pass") { if (use_device != p::kCUDA) { - LOG(WARNING) << "fuse_relu_depthwise_conv_pass is only supported on " - "GPU, skipped."; + VLOG(1) << "fuse_relu_depthwise_conv_pass is only supported on " + "GPU, skipped."; continue; } } else if (pass->Type() == "fusion_group_pass") { pass->Set("use_gpu", new bool((use_device == p::kCUDA))); if (use_device != p::kCUDA) { - LOG(WARNING) << "fusion_group_pass is only supported on GPU, skipped."; + VLOG(1) << "fusion_group_pass is only supported on GPU, skipped."; continue; } } else if (pass->Type() == "fuse_bn_act_pass") { if (use_device != p::kCUDA) { - LOG(WARNING) << "fuse_bn_act_pass is only supported on " - "GPU, skipped."; + VLOG(1) << "fuse_bn_act_pass is only supported on " + "GPU, skipped."; continue; } } else if (pass->Type() == "fuse_bn_add_act_pass") { if (use_device != p::kCUDA) { - LOG(WARNING) << "fuse_bn_add_act_pass is only supported on " - "GPU, skipped."; + VLOG(1) << "fuse_bn_add_act_pass is only supported on " + "GPU, skipped."; continue; } } else if (pass->Type() == "mkldnn_placement_pass") { diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cc b/paddle/fluid/framework/details/nan_inf_utils_detail.cc index 103dd0c5ae599b8126ef63fb8ae456846a2f1966..0fdb97db20af992998d94e37263f415a84cd1ba1 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.cc +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cc @@ -354,8 +354,36 @@ void CheckVarHasNanOrInf(const std::string& op_type, var_name)); #endif return; - } + } else if 
(platform::is_npu_place(tensor->place())) { +#ifdef PADDLE_WITH_ASCEND_CL + if (tensor->type() != proto::VarType::FP32) { + return; + } + + framework::LoDTensor cpu_tensor; + cpu_tensor.Resize(tensor->dims()); + float* cpu_data = static_cast( + cpu_tensor.mutable_data(platform::CPUPlace(), tensor->type())); + framework::TensorCopySync(*tensor, platform::CPUPlace(), &cpu_tensor); + bool flag = false; + for (int i = 0; i < cpu_tensor.numel(); i++) { + if (isnan(cpu_data[i]) || isinf(cpu_data[i])) { + flag = true; + break; + } + } + PADDLE_ENFORCE_NE( + flag, true, + platform::errors::Fatal("Operator %s output Tensor %s contains Inf or Nan.", + op_type, var_name)); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "Tensor[%s] uses npu place. PaddlePaddle must compile with NPU.", + var_name)); +#endif + return; + } tensor_check(op_type, var_name, *tensor, place); } diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h index 3038719539251cd6e5aec5692aab2b695b4212cd..84369011476c77765dc5396830adc34f775fbb50 100644 --- a/paddle/fluid/framework/device_worker.h +++ b/paddle/fluid/framework/device_worker.h @@ -28,7 +28,8 @@ limitations under the License. */ #include #include "paddle/fluid/framework/data_feed.h" -#include "paddle/fluid/framework/heter_service.h" +#include "paddle/fluid/framework/executor_gc_helper.h" +#include "paddle/fluid/framework/heter_util.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/program_desc.h" @@ -204,7 +205,7 @@ class DeviceWorker { Scope* root_scope_ = nullptr; Scope* thread_scope_; paddle::platform::Place place_; - int64_t batch_num_; + int64_t batch_num_ = 0; FetchConfig fetch_config_; bool use_cvm_; bool no_cvm_; @@ -265,6 +266,9 @@ class HogwildWorker : public CPUWorkerBase { HogwildWorkerParameter param_; std::vector skip_ops_; std::map stat_var_name_map_; +#ifdef PADDLE_WITH_HETERPS + platform::DeviceContext* dev_ctx_ = nullptr; +#endif }; class DownpourWorker : public HogwildWorker { @@ -454,7 +458,7 @@ class HeterBoxWorker : public HogwildWorker { virtual void CacheProgram(const ProgramDesc& main_program) { new (&program_) ProgramDesc(main_program); } - virtual void ProduceTasks() override; + void ProduceTasks() override; virtual void SetStream(const gpuStream_t stream) { copy_stream_ = stream; } virtual void SetEvent(const gpuEvent_t event) { event_ = event; } virtual void TrainFilesWithProfiler() {} @@ -555,13 +559,12 @@ class PSGPUWorker : public HogwildWorker { virtual void CacheProgram(const ProgramDesc& main_program) { new (&program_) ProgramDesc(main_program); } - virtual void ProduceTasks() override; + void ProduceTasks() override; virtual void SetStream(const gpuStream_t stream) { copy_stream_ = stream; } virtual void SetEvent(const gpuEvent_t event) { event_ = event; } void ResetStat(); protected: - std::shared_ptr fleet_ptr_; void PushGradients(); void DumpParam(); void CopySparseTable(); @@ -638,7 +641,8 @@ class PSGPUWorker : public HogwildWorker { }; #endif -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(PADDLE_WITH_ASCEND_CL) class SectionWorker : public DeviceWorker { public: SectionWorker() {} @@ -659,6 +663,9 @@ class SectionWorker : public DeviceWorker { void SetDeviceIndex(int tid) override {} void SetThreadIndex(int thread_id) { thread_id_ = thread_id; } void SetMicrobatchNum(int num) { num_microbatches_ = num; } + void 
SetPipelineStageNum(int num) { num_pipeline_stages_ = num; } + void SetPipelineStage(int stage) { pipeline_stage_ = stage; } + void SetScheduleMode(int mode) { schedule_mode_ = mode; } void SetMicrobatchScopes(const std::vector& scope) { microbatch_scopes_ = scope; } @@ -666,11 +673,23 @@ class SectionWorker : public DeviceWorker { void SetSkipVars(const std::vector& skip_vars) { skip_vars_ = skip_vars; } + void RunBackward( + int micro_id, std::unique_ptr&, + std::unordered_map>&); + void RunForward( + int micro_id, std::unique_ptr&, + std::unordered_map>&); + void RunUpdate( + std::unique_ptr&, + std::unordered_map>&); protected: int section_id_; int thread_id_; int num_microbatches_; + int num_pipeline_stages_; + int pipeline_stage_; + int schedule_mode_; // 0 for F-then-B and 1 for 1F1B std::vector microbatch_scopes_; std::vector skip_vars_; const Scope* minibatch_scope_; diff --git a/paddle/fluid/framework/device_worker_factory.cc b/paddle/fluid/framework/device_worker_factory.cc index a539a5d5f96b52eea852bc39b0081ea92ccfffc1..fb2323d96e2916f0b5c5a62fdb7c27341924bbe0 100644 --- a/paddle/fluid/framework/device_worker_factory.cc +++ b/paddle/fluid/framework/device_worker_factory.cc @@ -79,7 +79,8 @@ REGISTER_DEVICE_WORKER_CLASS(HeterBoxWorker); REGISTER_DEVICE_WORKER_CLASS(PSGPUWorker); #endif -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(PADDLE_WITH_ASCEND_CL) REGISTER_DEVICE_WORKER_CLASS(SectionWorker); #endif } // namespace framework diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto index 300f0eb0dbb50fa1437ac05bd01b8d27f3aaef34..d102fcdbe0cec14144d54f82e62760e8a1ceaec2 100644 --- a/paddle/fluid/framework/distributed_strategy.proto +++ b/paddle/fluid/framework/distributed_strategy.proto @@ -29,9 +29,24 @@ message RecomputeConfig { } message ShardingConfig { - optional float fuse_broadcast_MB = 1 [ default = 32.0 ]; - optional bool hybrid_dp = 2 [ default = false ]; - optional int32 sharding_group_size = 3 [ default = 8 ]; + optional string sharding_segment_strategy = 1 + [ default = 'segment_broadcast_MB' ]; + optional float segment_broadcast_MB = 2 [ default = 32.0 ]; + repeated string segment_anchors = 3; + optional int32 sharding_degree = 4 [ default = 8 ]; + optional int32 mp_degree = 5 [ default = 1 ]; + optional int32 dp_degree = 6 [ default = 1 ]; + optional bool hybrid_dp = 7 [ default = false ]; + optional int32 gradient_merge_acc_step = 8 [ default = 1 ]; + optional bool optimize_offload = 9 [ default = false ]; + optional bool pp_allreduce_in_optimize = 10 [ default = false ]; + optional int32 pp_degree = 11 [ default = 1 ]; +} + +message HybridConfig { + optional int32 dp_degree = 1 [ default = -1 ]; + optional int32 mp_degree = 2 [ default = 1 ]; + optional int32 pp_degree = 3 [ default = 1 ]; } message AMPConfig { @@ -115,11 +130,17 @@ message AsyncConfig { optional bool launch_barrier = 9 [ default = true ]; optional string heter_worker_device_guard = 10 [ default = 'cpu' ]; optional int32 lr_decay_steps = 11 [ default = 10 ]; + optional int32 use_ps_gpu = 12 [ default = 0 ]; } message PipelineConfig { optional int32 micro_batch_size = 1 [ default = 1 ]; optional int32 accumulate_steps = 2 [ default = 1 ]; + optional string schedule_mode = 3 [ default = '1F1B' ]; +} + +message TensorParallelConfig { + optional int32 tensor_parallel_degree = 1 [ default = 1 ]; } message DistributedStrategy { @@ -151,6 +172,9 @@ message 
DistributedStrategy { optional bool fp16_allreduce = 25 [ default = false ]; optional bool sharding = 26 [ default = false ]; optional float last_comm_group_size_MB = 27 [ default = 1 ]; + optional bool find_unused_parameters = 28 [ default = false ]; + optional bool tensor_parallel = 29 [ default = false ]; + optional bool without_graph_optimization = 30 [ default = false ]; optional RecomputeConfig recompute_configs = 101; optional AMPConfig amp_configs = 102; @@ -163,6 +187,8 @@ message DistributedStrategy { optional LambConfig lamb_configs = 109; optional AdaptiveLocalSGDConfig adaptive_localsgd_configs = 110; optional ShardingConfig sharding_configs = 111; + optional HybridConfig hybrid_configs = 112; + optional TensorParallelConfig tensor_parallel_configs = 113; optional BuildStrategy build_strategy = 201; optional ExecutionStrategy execution_strategy = 202; } diff --git a/paddle/fluid/framework/dlpack_tensor.cc b/paddle/fluid/framework/dlpack_tensor.cc index a3fbb008fe4f444b7ad5b1fb3eb695ca4b4c7796..b99ab6b5a7ff195ef7d659598df88467bb158c6e 100644 --- a/paddle/fluid/framework/dlpack_tensor.cc +++ b/paddle/fluid/framework/dlpack_tensor.cc @@ -82,6 +82,11 @@ struct DLContextVisitor : public boost::static_visitor<::DLContext> { platform::errors::Unimplemented("platform::XPUPlace is not supported")); } + inline ::DLContext operator()(const platform::NPUPlace &place) const { + PADDLE_THROW( + platform::errors::Unimplemented("platform::NPUPlace is not supported")); + } + inline ::DLContext operator()(const platform::CUDAPlace &place) const { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) ::DLContext ctx; diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 0acc8a55fa9f8a79c67b7beb732996c86f86ec5a..de007c128d7543c1433426e80abcbd80ee47dee8 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -72,7 +72,7 @@ Executor::~Executor() { #ifdef PADDLE_WITH_MKLDNN // Clear mkl-dnn cache, // this is needed to have mkl-dnn unit tests working - ClearMKLDNNCache(place_); + ClearMKLDNNCache(place_, this); #endif } @@ -169,6 +169,9 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, bool force_disable_gc, bool keep_kid_scopes) { platform::RecordBlock b(block_id); if (FLAGS_use_mkldnn) EnableMKLDNN(pdesc); +#ifdef PADDLE_WITH_MKLDNN + platform::AttachPointerHashToMKLDNNKey(this, place_); +#endif auto ctx = Prepare(pdesc, block_id, skip_ref_cnt_vars, force_disable_gc); RunPreparedContext(ctx.get(), scope, create_local_scope, create_vars, keep_kid_scopes); @@ -294,6 +297,9 @@ void Executor::Run(const ProgramDesc& program, Scope* scope, const std::string& fetch_holder_name) { platform::RecordBlock b(kProgramId); if (FLAGS_use_mkldnn) EnableMKLDNN(program); +#ifdef PADDLE_WITH_MKLDNN + platform::AttachPointerHashToMKLDNNKey(this, place_); +#endif bool has_feed_ops = has_feed_operators(program.Block(0), *feed_targets, feed_holder_name); bool has_fetch_ops = @@ -453,6 +459,25 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx, #else PADDLE_THROW( platform::errors::Unimplemented("No XPU gc found in CPU/GPU paddle")); +#endif + } else if (platform::is_npu_place(place_)) { +#ifdef PADDLE_WITH_ASCEND_CL + if (IsFastEagerDeletionModeEnabled()) { + VLOG(4) << "Use unsafe fast gc for NPU."; + gc.reset(new NPUUnsafeFastGarbageCollector( + BOOST_GET_CONST(platform::NPUPlace, place_), max_memory_size)); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Please set 
FLAGS_fast_eager_deletion_mode=true to use " + "GarbageCollector on NPU.")); + // TODO(zhiqiu): fix bugs and enable NPUDefaultStreamGarbageCollector. + VLOG(4) << "Use default stream gc for NPU."; + gc.reset(new NPUDefaultStreamGarbageCollector( + BOOST_GET_CONST(platform::NPUPlace, place_), max_memory_size)); + } +#else + PADDLE_THROW( + platform::errors::Unimplemented("No NPU gc found in CPU/NPU paddle")); #endif } } @@ -557,7 +582,6 @@ void Executor::EnableMKLDNN(const ProgramDesc& program) { } } } - platform::AttachPointerHashToMKLDNNKey(this, place_); #else LOG(WARNING) << "'MKLDNN' is not supported, Please re-compile with WITH_MKLDNN option"; diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h index 7593b60abfffcd9a0a3e9f743930660327c1409e..9c9f29520de439ee209ced19f448bde9905b231b 100644 --- a/paddle/fluid/framework/executor.h +++ b/paddle/fluid/framework/executor.h @@ -20,14 +20,12 @@ limitations under the License. */ #include #include -#include "paddle/fluid/framework/data_set.h" #include "paddle/fluid/framework/executor_gc_helper.h" #include "paddle/fluid/framework/garbage_collector.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/framework/trainer.h" #include "paddle/fluid/platform/device_context.h" namespace paddle { diff --git a/paddle/fluid/framework/executor_cache.h b/paddle/fluid/framework/executor_cache.h index 782018d1cfe109c3a0cb4919969665207dcfbc9e..3beeacb1010d2687ac0dfd58092773f52c4fafdc 100644 --- a/paddle/fluid/framework/executor_cache.h +++ b/paddle/fluid/framework/executor_cache.h @@ -22,8 +22,10 @@ #include #include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/platform/macros.h" +#include "paddle/fluid/string/string_helper.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/executor_gc_helper.cc b/paddle/fluid/framework/executor_gc_helper.cc index c8bc735790400bbc1552c294602275bbf9ab90d4..c06a3d4a183799c7c8ca130f9ff48e7bff23a3bd 100644 --- a/paddle/fluid/framework/executor_gc_helper.cc +++ b/paddle/fluid/framework/executor_gc_helper.cc @@ -18,7 +18,6 @@ #include "glog/logging.h" #include "paddle/fluid/framework/block_desc.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/no_need_buffer_vars_inference.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/operator.h" diff --git a/paddle/fluid/framework/fleet/CMakeLists.txt b/paddle/fluid/framework/fleet/CMakeLists.txt index 61f3c026f1facc6afb2b9b45316b1205cf676904..a9e4691dd0a01544e1d75d3d27dce43585081837 100644 --- a/paddle/fluid/framework/fleet/CMakeLists.txt +++ b/paddle/fluid/framework/fleet/CMakeLists.txt @@ -1,20 +1,27 @@ if(WITH_PSLIB) - cc_library(fleet_wrapper SRCS fleet_wrapper.cc DEPS framework_proto variable_helper scope pslib_brpc pslib) + if(WITH_PSLIB_BRPC) + set(BRPC_DEPS pslib_brpc) + else() + set(BRPC_DEPS brpc) + endif(WITH_PSLIB_BRPC) + cc_library(fleet_wrapper SRCS fleet_wrapper.cc DEPS framework_proto variable_helper scope ${BRPC_DEPS} pslib) +else() + cc_library(fleet_wrapper SRCS fleet_wrapper.cc DEPS framework_proto variable_helper scope) +endif(WITH_PSLIB) + +if(WITH_HETERPS) if(WITH_NCCL) nv_library(ps_gpu_wrapper SRCS ps_gpu_wrapper.cu ps_gpu_wrapper.cc - DEPS heter_ps) + DEPS heter_ps ${BRPC_DEPS}) 
add_subdirectory(heter_ps) elseif(WITH_RCCL) hip_library(ps_gpu_wrapper SRCS ps_gpu_wrapper.cu ps_gpu_wrapper.cc - DEPS heter_ps) + DEPS heter_ps ${BRPC_DEPS}) add_subdirectory(heter_ps) - else() - cc_library(ps_gpu_wrapper SRCS ps_gpu_wrapper.cc) endif(WITH_NCCL) else() - cc_library(fleet_wrapper SRCS fleet_wrapper.cc DEPS framework_proto variable_helper scope) cc_library(ps_gpu_wrapper SRCS ps_gpu_wrapper.cc) -endif(WITH_PSLIB) +endif(WITH_HETERPS) if(WITH_NCCL OR WITH_RCCL) cc_library(nccl_wrapper SRCS nccl_wrapper.cc DEPS framework_proto variable_helper scope) @@ -37,10 +44,20 @@ else() cc_library(gloo_wrapper SRCS gloo_wrapper.cc DEPS framework_proto variable_helper scope) endif(WITH_GLOO) -cc_library(heter_wrapper SRCS heter_wrapper.cc DEPS framework_proto device_context heter_service_proto) +if(WITH_PSLIB) +set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") +if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0) + set(DISTRIBUTE_COMPILE_FLAGS + "${DISTRIBUTE_COMPILE_FLAGS} -faligned-new") +endif() +set_source_files_properties(heter_wrapper.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +endif() + +cc_library(heter_wrapper SRCS heter_wrapper.cc DEPS framework_proto +device_context heter_service_proto ${BRPC_DEPS}) cc_test(test_fleet_cc SRCS test_fleet.cc DEPS fleet_wrapper gloo_wrapper fs shell) -if(WITH_ASCEND) - cc_library(ascend_wrapper SRCS ascend_wrapper.cc DEPS framework_proto lod_tensor ascend ascend_graph) -endif(WITH_ASCEND) +if(WITH_ASCEND OR WITH_ASCEND_CL) + cc_library(ascend_wrapper SRCS ascend_wrapper.cc DEPS framework_proto lod_tensor ascend_ge ascend_graph) +endif() diff --git a/paddle/fluid/framework/fleet/ascend_wrapper.cc b/paddle/fluid/framework/fleet/ascend_wrapper.cc index d1b2f51f700363cf319344ab35b10af545c0373a..273939f6bee613f44353939435a71e571074d4f2 100644 --- a/paddle/fluid/framework/fleet/ascend_wrapper.cc +++ b/paddle/fluid/framework/fleet/ascend_wrapper.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifdef PADDLE_WITH_ASCEND +#ifdef PADDLE_WITH_ASCEND_CL #include "paddle/fluid/framework/fleet/ascend_wrapper.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/fleet/ascend_wrapper.h b/paddle/fluid/framework/fleet/ascend_wrapper.h index da79fccb8ca69fac0f34f8092f296b9923e5f849..f749ee8cfa0baa410e3b6c4b67de07bdd30611ab 100644 --- a/paddle/fluid/framework/fleet/ascend_wrapper.h +++ b/paddle/fluid/framework/fleet/ascend_wrapper.h @@ -14,7 +14,7 @@ limitations under the License. */ #pragma once -#ifdef PADDLE_WITH_ASCEND +#ifdef PADDLE_WITH_ASCEND_CL #include #include @@ -29,7 +29,6 @@ limitations under the License. */ #include "paddle/fluid/platform/timer.h" #include "ge/ge_api.h" -#include "ge/ge_api_types.h" #include "graph/attr_value.h" #include "graph/tensor.h" #include "graph/types.h" @@ -37,25 +36,50 @@ limitations under the License. 
*/ namespace paddle { namespace framework { -// typedef std::vector AscendGraphDesc; typedef ge::Graph AscendGraphDesc; +#ifdef PADDLE_WITH_ASCEND_STRING +using AscendString = ge::AscendString; +#else +using AscendString = std::string; +#endif + class AscendInstance { public: virtual ~AscendInstance() {} AscendInstance() {} - std::map GetDefaultInitSessionOptions() { - std::map init_options; - init_options["a"] = "b"; - init_options["ge.trainFlag"] = "1"; + std::map _GetDefaultInitOptions() { + std::map init_options; + init_options["ge.exec.deviceId"] = "0"; + init_options["ge.graphRunMode"] = "1"; + return init_options; + } + + std::map _GetDefaultInitSessionOptions() { + std::map init_options; + // init_options["a"] = "b"; + // init_options["ge.trainFlag"] = "1"; return init_options; } - // add other parameters here to init + ge::Status InitGEForUT() { + return ge::GEInitialize(_GetDefaultInitOptions()); + } + void InitGlobalResouces() { - session_.reset(new ge::Session(GetDefaultInitSessionOptions())); - VLOG(1) << "InitGlobalResouces Done"; + LOG(INFO) << "Begin ascend InitGlobalResouces"; + session_.reset(new ge::Session(_GetDefaultInitSessionOptions())); + if (session_ == nullptr) { + PADDLE_THROW(platform::errors::Fatal("new session error: nullptr")); + } + LOG(INFO) << "End ascend InitGlobalResouces"; + } + + void DestroyGlobalResouces() { + LOG(INFO) << "Begin ascend DestroyGlobalResouces"; + session_ = nullptr; + LOG(INFO) << "End ascend DestroyGlobalResouces"; + } static std::shared_ptr GetInstance() { @@ -178,6 +202,6 @@ class AscendInstance { private: static std::shared_ptr ascend_instance_; }; -} // end namespace framework -} // end namespace paddle +} // namespace framework +} // namespace paddle #endif diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.h b/paddle/fluid/framework/fleet/fleet_wrapper.h index e584fb5e2b9ca77923161c8c89c2e7784c5d164b..09f7801b19f988bb7c0948b127b79e6d848629be 100644 --- a/paddle/fluid/framework/fleet/fleet_wrapper.h +++ b/paddle/fluid/framework/fleet/fleet_wrapper.h @@ -28,12 +28,15 @@ limitations under the License. */ #include #include -#include "paddle/fluid/framework/heter_service.h" +#include "paddle/fluid/framework/heter_util.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/variable_helper.h" #include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN +#ifdef PADDLE_WITH_HETERPS +#include "paddle/fluid/platform/type_defs.h" +#endif namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/fleet/heter_context.h b/paddle/fluid/framework/fleet/heter_context.h index a02931b3f5c28a6e8e09866f3352109b7fe91adb..1fb2f0fab4aff9926f6ed6c30fa72cb9e9c93cf6 100644 --- a/paddle/fluid/framework/fleet/heter_context.h +++ b/paddle/fluid/framework/fleet/heter_context.h @@ -14,15 +14,21 @@ limitations under the License. 
*/ #pragma once -#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \ - (defined PADDLE_WITH_PSLIB) +#ifdef PADDLE_WITH_HETERPS #include #include #include #include +#ifdef PADDLE_WITH_PSLIB #include "common_value.h" // NOLINT +#endif + +#ifdef PADDLE_WITH_PSCORE +#include "paddle/fluid/distributed/table/depends/large_scale_kv.h" +#endif + #include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" #include "paddle/fluid/framework/scope.h" @@ -39,7 +45,12 @@ class HeterContext { } Scope* scope_{nullptr}; std::vector> feature_keys_; +#ifdef PADDLE_WITH_PSLIB std::vector> value_ptr_; +#endif +#ifdef PADDLE_WITH_PSCORE + std::vector> value_ptr_; +#endif std::vector> device_values_; std::vector> device_keys_; std::vector mutex_; @@ -66,6 +77,21 @@ class HeterContext { mutex_[i] = new std::mutex(); } } + + void Reset() { + for (size_t i = 0; i < feature_keys_.size(); ++i) { + feature_keys_[i].clear(); + } + for (size_t i = 0; i < value_ptr_.size(); ++i) { + value_ptr_[i].clear(); + } + for (size_t i = 0; i < device_values_.size(); ++i) { + device_values_[i].clear(); + } + for (size_t i = 0; i < device_keys_.size(); ++i) { + device_keys_[i].clear(); + } + } void batch_add_keys( const std::vector>& thread_keys) { assert(thread_keys.size() == feature_keys_.size()); @@ -79,6 +105,15 @@ class HeterContext { } } + void batch_add_keys(int shard_num, + const std::unordered_set& shard_keys) { + int idx = feature_keys_[shard_num].size(); + feature_keys_[shard_num].resize(feature_keys_[shard_num].size() + + shard_keys.size()); + std::copy(shard_keys.begin(), shard_keys.end(), + feature_keys_[shard_num].begin() + idx); + } + void UniqueKeys() { std::vector threads; auto unique_func = [this](int i) { diff --git a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt index 6df2cd52bb401d3cc378c2776073471070f1e411..67c44368b7aea471a4e73eaf6132c85883068d58 100644 --- a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt +++ b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt @@ -1,5 +1,13 @@ IF(WITH_GPU) - nv_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h hashtable.h DEPS cub device_context) + SET(HETERPS_DEPS device_context) + if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) + SET(HETERPS_DEPS ${HETERPS_DEPS} cub) + endif() + if(WITH_PSCORE) + get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS) + SET(HETERPS_DEPS ${HETERPS_DEPS} ${RPC_DEPS}) + endif() + nv_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h hashtable.h DEPS ${HETERPS_DEPS}) nv_test(test_heter_comm SRCS test_heter_comm.cu feature_value.h DEPS heter_comm) nv_library(heter_ps SRCS heter_ps.cu DEPS heter_comm) ENDIF() diff --git a/paddle/fluid/framework/fleet/heter_ps/feature_value.h b/paddle/fluid/framework/fleet/heter_ps/feature_value.h index 698ece09de6c50781662659e317f4b1fc8f340b1..c3bf33b32c2daf298ddc9af546c4c047bf6e9a6e 100644 --- a/paddle/fluid/framework/fleet/heter_ps/feature_value.h +++ b/paddle/fluid/framework/fleet/heter_ps/feature_value.h @@ -14,7 +14,7 @@ limitations under the License. 
*/ #pragma once -#ifdef PADDLE_WITH_PSLIB +#ifdef PADDLE_WITH_HETERPS #include diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable.h b/paddle/fluid/framework/fleet/heter_ps/hashtable.h index e5c0972763bede000961e970390c64431ac3cb22..3782e14ad41a5ed6ce5ef1eb0788842d03ecddc7 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable.h +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable.h @@ -17,11 +17,17 @@ limitations under the License. */ #include #include #include +#ifdef PADDLE_WITH_PSLIB #include "common_value.h" // NOLINT +#endif +#ifdef PADDLE_WITH_PSCORE +#include "paddle/fluid/distributed/table/depends/large_scale_kv.h" +#endif #include "thrust/pair.h" //#include "cudf/concurrent_unordered_map.cuh.h" #include "paddle/fluid/framework/fleet/heter_ps/cudf/concurrent_unordered_map.cuh.h" -#ifdef PADDLE_WITH_PSLIB +#ifdef PADDLE_WITH_HETERPS +#include "paddle/fluid/platform/type_defs.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h b/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h index 871f9c7857af46d8aad7cfbfafcdc80f0f52f259..098c795fc7e1f968491a05a241aa397b53eea7f9 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#ifdef PADDLE_WITH_PSLIB +#ifdef PADDLE_WITH_HETERPS namespace paddle { namespace framework { @@ -119,6 +119,7 @@ void HashTable::dump_to_cpu(int devid, cudaStream_t stream) { continue; } ValType& gpu_val = kv[i].second; +#ifdef PADDLE_WITH_PSLIB auto* downpour_value = (paddle::ps::DownpourFixedFeatureValue*)(gpu_val.cpu_ptr); int downpour_value_size = downpour_value->size(); @@ -138,6 +139,14 @@ void HashTable::dump_to_cpu(int devid, cudaStream_t stream) { cpu_val[x + 7] = gpu_val.mf[x]; } } +#endif +#ifdef PADDLE_WITH_PSCORE + auto* downpour_value = (paddle::distributed::VALUE*)(gpu_val.cpu_ptr); + downpour_value->count_ = gpu_val.show; + for (int x = 0; x < gpu_val.mf_size; x++) { + downpour_value->data_[x] = gpu_val.mf[x]; + } +#endif } container_->prefetch(devid, stream); diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm.h index 0e38ebbd7f4e7280d5571ffd216143b277f063d6..2ec2a8a1f1e223c85ffbf26ecbaccc434ca055f7 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm.h @@ -25,7 +25,7 @@ limitations under the License. */ #include "paddle/fluid/platform/place.h" #include "thrust/pair.h" -#ifdef PADDLE_WITH_PSLIB +#ifdef PADDLE_WITH_HETERPS namespace paddle { namespace framework { @@ -182,7 +182,7 @@ class HeterComm { std::vector> path_; std::vector storage_; int feanum_{1800 * 2048}; - int multi_node_{1}; + int multi_node_{0}; std::vector nccl_inner_comms_; std::vector nccl_inter_comms_; int node_size_; diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h index 2f1c809c01eaadcad8c3406e882acafaadb09134..1b4205e3c38fe27419c4ba42e6950b581db62a99 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h @@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#ifdef PADDLE_WITH_HETERPS #include -#ifdef PADDLE_WITH_PSLIB namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu index f2e129ded9fefc58e40e477142cd56de2c0a3448..581b0d511c23ee070b6dc33af315cc420f6ef20a 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu @@ -15,7 +15,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/fleet/heter_ps/heter_ps.h" -#ifdef PADDLE_WITH_PSLIB +#ifdef PADDLE_WITH_HETERPS namespace paddle { namespace framework { @@ -54,8 +54,8 @@ void HeterPs::show_one_table(int gpu_num) { comm_->show_one_table(gpu_num); } void HeterPs::push_sparse(int num, FeatureKey* d_keys, FeaturePushValue* d_grads, size_t len) { - // comm_->push_sparse(num, d_keys, d_grads, len, opt_); - comm_->push_sparse_multi_node(num, d_keys, d_grads, len, opt_); + comm_->push_sparse(num, d_keys, d_grads, len, opt_); + // comm_->push_sparse_multi_node(num, d_keys, d_grads, len, opt_); } void HeterPs::set_nccl_comm_and_size(const std::vector& inner_comms, diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps.h b/paddle/fluid/framework/fleet/heter_ps/heter_ps.h index 142f4a93b93a29410f02ef32fb7d7bd08fb6654f..d78b6b492074deb9f3dcc1073e951353c0846abb 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps.h @@ -18,7 +18,7 @@ limitations under the License. */ #include "paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h" #include "paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h" -#ifdef PADDLE_WITH_PSLIB +#ifdef PADDLE_WITH_HETERPS namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h b/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h index 7980220eab9b9b6f36d6449c5f553daeee2e36f3..05b3ecf9c3c12c6b4df1192785b0659a8ef851d0 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h @@ -17,7 +17,7 @@ limitations under the License. */ #include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" #include "paddle/fluid/framework/fleet/heter_ps/heter_resource.h" -#ifdef PADDLE_WITH_PSLIB +#ifdef PADDLE_WITH_HETERPS namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_resource.cc b/paddle/fluid/framework/fleet/heter_ps/heter_resource.cc index f65b664f83ba0dd3a383d9443d67679cef3a509c..0f2af2a522e287dd9361ba8448441a41b7293794 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_resource.cc +++ b/paddle/fluid/framework/fleet/heter_ps/heter_resource.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#ifdef PADDLE_WITH_PSLIB +#ifdef PADDLE_WITH_HETERPS #include "heter_resource.h" #include "paddle/fluid/platform/cuda_device_guard.h" diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_resource.h b/paddle/fluid/framework/fleet/heter_ps/heter_resource.h index ad7649a8a33cb77e0581429d1da202b2d218b0dc..7b23379994c735eb1f8ad9d5c5d787388d08ce8d 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_resource.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_resource.h @@ -20,7 +20,7 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/fluid/platform/enforce.h" -#ifdef PADDLE_WITH_PSLIB +#ifdef PADDLE_WITH_HETERPS namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h b/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h index b3ec9e752e62bb01a73f2d2070f94a47f8fe0730..7e82a8e014fd3cb33b706c9fc5c1e671392e05a7 100644 --- a/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h +++ b/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h @@ -18,7 +18,7 @@ limitations under the License. */ #include "optimizer_conf.h" #include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" -#ifdef PADDLE_WITH_PSLIB +#ifdef PADDLE_WITH_HETERPS namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/fleet/heter_wrapper.h b/paddle/fluid/framework/fleet/heter_wrapper.h index 871d2e251b41016d548fa1e257560aca9db030d7..4e529de077593777c1ab326db395febaefb9564a 100644 --- a/paddle/fluid/framework/fleet/heter_wrapper.h +++ b/paddle/fluid/framework/fleet/heter_wrapper.h @@ -25,6 +25,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_PSLIB #include "paddle/fluid/framework/heter_service.h" +#include "paddle/fluid/framework/heter_util.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/variable_helper.h" diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc index 4274876c9975e5b16824af4c799bf81228659d92..67ff6b6acaefb26adc1389559a763b98f41a533a 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc @@ -26,8 +26,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \ - (defined PADDLE_WITH_PSLIB) +#ifdef PADDLE_WITH_HETERPS #include #include @@ -58,7 +57,12 @@ void PSGPUWrapper::BuildTask(std::shared_ptr gpu_task, auto& device_mutex = gpu_task->mutex_; std::vector threads; +#ifdef PADDLE_WITH_PSLIB auto fleet_ptr = FleetWrapper::GetInstance(); +#endif +#ifdef PADDLE_WITH_PSCORE + auto fleet_ptr = paddle::distributed::Communicator::GetInstance(); +#endif // data should be in input channel thread_keys_.resize(thread_keys_thread_num_); @@ -99,12 +103,26 @@ void PSGPUWrapper::BuildTask(std::shared_ptr gpu_task, timeline.Start(); + threads.clear(); // merge thread_keys to shard_keys - for (size_t i = 0; i < thread_keys_.size(); i++) { - gpu_task->batch_add_keys(thread_keys_[i]); - for (int j = 0; j < thread_keys_thread_num_; j++) { - thread_keys_[i][j].clear(); + auto merge_ins_func = [this, gpu_task](int shard_num) { + for (int i = 0; i < thread_keys_thread_num_; ++i) { + gpu_task->batch_add_keys(shard_num, thread_keys_[i][shard_num]); + thread_keys_[i][shard_num].clear(); } + }; + + // for (size_t i = 0; i < thread_keys_.size(); i++) { + // gpu_task->batch_add_keys(thread_keys_[i]); + // for (int j = 0; j < thread_keys_thread_num_; j++) { + // thread_keys_[i][j].clear(); + // } + //} + for (int i = 0; i < thread_keys_shard_num_; ++i) { + threads.push_back(std::thread(merge_ins_func, i)); + } + for (auto& t : threads) { + t.join(); } timeline.Pause(); @@ -124,9 +142,16 @@ void PSGPUWrapper::BuildTask(std::shared_ptr gpu_task, auto ptl_func = [this, &local_keys, &local_ptr, &table_id, &fleet_ptr](int i) { size_t key_size = local_keys[i].size(); +#ifdef PADDLE_WITH_PSLIB auto tt = fleet_ptr->pslib_ptr_->_worker_ptr->pull_sparse_ptr( reinterpret_cast(local_ptr[i].data()), table_id, local_keys[i].data(), key_size); +#endif +#ifdef PADDLE_WITH_PSCORE + auto tt = fleet_ptr->_worker_ptr->pull_sparse_ptr( + reinterpret_cast(local_ptr[i].data()), table_id, + local_keys[i].data(), key_size); +#endif tt.wait(); auto status = tt.get(); // auto status = 0; @@ -153,8 +178,14 @@ void PSGPUWrapper::BuildTask(std::shared_ptr gpu_task, auto build_func = [device_num, &local_keys, &local_ptr, &device_keys, &device_vals, &device_mutex](int i) { std::vector> task_keys(device_num); +#ifdef PADDLE_WITH_PSLIB std::vector> task_ptrs( device_num); +#endif + +#ifdef PADDLE_WITH_PSCORE + std::vector> task_ptrs(device_num); +#endif for (size_t j = 0; j < local_keys[i].size(); j++) { int shard = local_keys[i][j] % device_num; @@ -169,7 +200,7 @@ void PSGPUWrapper::BuildTask(std::shared_ptr gpu_task, int cur = device_keys[dev].size(); device_keys[dev].resize(device_keys[dev].size() + len); device_vals[dev].resize(device_vals[dev].size() + len); - +#ifdef PADDLE_WITH_PSLIB for (int j = 0; j < len; ++j) { device_keys[dev][cur + j] = task_keys[dev][j]; float* ptr_val = task_ptrs[dev][j]->data(); @@ -196,6 +227,35 @@ void PSGPUWrapper::BuildTask(std::shared_ptr gpu_task, } } } +#endif +#ifdef PADDLE_WITH_PSCORE + for (int j = 0; j < len; ++j) { + device_keys[dev][cur + j] = task_keys[dev][j]; + distributed::VALUE* ptr_val = task_ptrs[dev][j]; + FeatureValue& val = device_vals[dev][cur + j]; + bool has_mf = 1; + val.delta_score = 0; + val.show = ptr_val->count_; + val.clk = 0; + val.slot = 0; + val.lr = 0; + val.lr_g2sum = 0; + val.cpu_ptr = (uint64_t)(task_ptrs[dev][j]); + + if (has_mf) { + val.mf_size = MF_DIM + 1; + for (int x = 0; x < val.mf_size; x++) { + val.mf[x] = ptr_val->data_[x]; + } + } else { + val.mf_size = 0; + 
for (int x = 0; x < MF_DIM + 1; x++) { + val.mf[x] = 0; + } + } + } +#endif + VLOG(1) << "GpuPs build hbmps done"; device_mutex[dev]->unlock(); } @@ -215,6 +275,7 @@ void PSGPUWrapper::BuildTask(std::shared_ptr gpu_task, void PSGPUWrapper::BuildGPUPS(uint64_t table_id, int feature_dim) { int device_num = heter_devices_.size(); std::shared_ptr gpu_task = gpu_task_pool_.Get(); + gpu_task->Reset(); BuildTask(gpu_task, table_id, feature_dim); platform::Timer timeline; timeline.Start(); @@ -227,8 +288,8 @@ void PSGPUWrapper::BuildGPUPS(uint64_t table_id, int feature_dim) { size_max = std::max(size_max, feature_keys_count[i]); } if (HeterPs_) { - HeterPs_->show_one_table(0); - return; + delete HeterPs_; + HeterPs_ = nullptr; } std::vector threads(device_num); HeterPs_ = HeterPsBase::get_instance(size_max, resource_); @@ -249,6 +310,7 @@ void PSGPUWrapper::BuildGPUPS(uint64_t table_id, int feature_dim) { timeline.Pause(); VLOG(1) << "GpuPs build table total costs: " << timeline.ElapsedSec() << " s."; + gpu_task_pool_.Push(gpu_task); } void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu index 2eedcd5f1c70052533243540e0ca432dea24725d..2bf564d3f76d5a7039bf8ff03a00b66abfb84171 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#ifdef PADDLE_WITH_PSLIB +#ifdef PADDLE_WITH_HETERPS #include #include #include diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h index ef586b41fe05d2f21e1469dcd7bcce3d77fc9651..cfb23d1be2acfed0a878cb3bffa241afa2cf3de8 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h @@ -14,8 +14,7 @@ limitations under the License. */ #pragma once -#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \ - (defined PADDLE_WITH_PSLIB) +#ifdef PADDLE_WITH_HETERPS #include #include @@ -26,7 +25,6 @@ limitations under the License. */ #include #include #include - #ifdef PADDLE_WITH_GLOO #include #include "paddle/fluid/framework/fleet/gloo_wrapper.h" @@ -42,6 +40,9 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN #include "paddle/fluid/platform/place.h" +#ifdef PADDLE_WITH_PSCORE +#include "paddle/fluid/distributed/service/communicator.h" +#endif namespace paddle { namespace framework { @@ -219,7 +220,7 @@ class PSGPUWrapper { std::shared_ptr resource_; int32_t sleep_seconds_before_fail_exit_; std::vector slot_vector_; - int multi_node_{1}; + int multi_node_{0}; int node_size_; std::vector inner_comms_; std::vector inter_comms_; diff --git a/paddle/fluid/framework/garbage_collector.cc b/paddle/fluid/framework/garbage_collector.cc index c8b6c7642551756273ebecb83093e1d75d131f2c..9ab6b5d8c178b9272241ce8db600f3ea32276988 100644 --- a/paddle/fluid/framework/garbage_collector.cc +++ b/paddle/fluid/framework/garbage_collector.cc @@ -86,8 +86,9 @@ StreamGarbageCollector::StreamGarbageCollector(const platform::CUDAPlace &place, PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamCreate(&stream_)); #else PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream_)); + callback_manager_.reset( + new platform::StreamCallbackManager(stream_)); #endif - callback_manager_.reset(new platform::StreamCallbackManager(stream_)); } StreamGarbageCollector::~StreamGarbageCollector() { @@ -121,6 +122,32 @@ void CUDAPinnedGarbageCollector::ClearCallback( } #endif +#ifdef PADDLE_WITH_ASCEND_CL +NPUDefaultStreamGarbageCollector::NPUDefaultStreamGarbageCollector( + const platform::NPUPlace &place, size_t max_memory_size) + : GarbageCollector(place, max_memory_size) {} + +void NPUDefaultStreamGarbageCollector::Wait() const { + static_cast(this->dev_ctx_) + ->WaitStreamCallback(); +} + +void NPUDefaultStreamGarbageCollector::ClearCallback( + const std::function &callback) { + static_cast(this->dev_ctx_) + ->AddStreamCallback(callback); +} +NPUUnsafeFastGarbageCollector::NPUUnsafeFastGarbageCollector( + const platform::NPUPlace &place, size_t max_memory_size) + : GarbageCollector(place, max_memory_size) {} + +void NPUUnsafeFastGarbageCollector::ClearCallback( + const std::function &callback) { + callback(); +} + +#endif + int64_t GetEagerDeletionThreshold() { return FLAGS_eager_delete_tensor_gb < 0 ? 
-1 diff --git a/paddle/fluid/framework/garbage_collector.h b/paddle/fluid/framework/garbage_collector.h index 97800865af861f6598a3e74456deef1d0c355786..2c2b57bbe420a84ed6a9c7250d4246e677bcf88a 100644 --- a/paddle/fluid/framework/garbage_collector.h +++ b/paddle/fluid/framework/garbage_collector.h @@ -117,7 +117,8 @@ class StreamGarbageCollector : public GarbageCollector { private: gpuStream_t stream_; - std::unique_ptr callback_manager_; + std::unique_ptr> + callback_manager_; }; class CUDAPinnedGarbageCollector : public GarbageCollector { @@ -130,6 +131,28 @@ class CUDAPinnedGarbageCollector : public GarbageCollector { }; #endif +#ifdef PADDLE_WITH_ASCEND_CL +class NPUDefaultStreamGarbageCollector : public GarbageCollector { + public: + NPUDefaultStreamGarbageCollector(const platform::NPUPlace &place, + size_t max_memory_size); + + void Wait() const override; + + protected: + void ClearCallback(const std::function &callback) override; +}; + +class NPUUnsafeFastGarbageCollector : public GarbageCollector { + public: + NPUUnsafeFastGarbageCollector(const platform::NPUPlace &place, + size_t max_memory_size); + + protected: + void ClearCallback(const std::function &callback) override; +}; +#endif + template void GarbageCollector::Add(Container &&objs) { Add(std::forward(objs), []() {}); diff --git a/paddle/fluid/framework/heter_service.h b/paddle/fluid/framework/heter_service.h index 8f52235c962445b2d493f79d178a8e404afdf343..7e5bf138d9fa9270eef7b19e0b350301a2290ab7 100644 --- a/paddle/fluid/framework/heter_service.h +++ b/paddle/fluid/framework/heter_service.h @@ -30,10 +30,12 @@ limitations under the License. */ #include "brpc/controller.h" #include "brpc/server.h" #include "paddle/fluid/platform/timer.h" +#endif namespace paddle { namespace framework { +#ifdef PADDLE_WITH_PSLIB typedef std::function HeterServiceHandler; class DataFeed; @@ -70,308 +72,7 @@ class HeterXpuService : public HeterService { std::unordered_map handler_map_; }; -enum HeterTaskState { PULL_SPARSE, OP_RUN, XPU, OP_RUN_END, PUSH_GRAD, DONE }; - -class HeterTask { - public: - void Update() { - if (state_ == PULL_SPARSE) { - state_ = OP_RUN; - } else if (state_ == OP_RUN) { - state_ = XPU; - // state_ = PUSH_GRAD; - // state_ = PUSH_GRAD; - } else if (state_ == XPU) { - state_ = OP_RUN_END; - } else if (state_ == OP_RUN_END) { - state_ = PUSH_GRAD; - } else if (state_ == PUSH_GRAD) { - state_ = DONE; - } - } - void Reset() { - total_time = 0; - read_time = 0; - pack_time = 0; - pull_sparse_local_time = 0; - op_all_time = 0; - xpu_op_time = 0; - xpu_wait_time = 0; - cpu_op_time = 0; - collect_label_time = 0; - fill_sparse_time = 0; - push_sparse_time = 0; - gpu_2_cpu_time = 0; - cpu_2_gpu_time = 0; - timeline.Reset(); - } - void Show() { - std::cout << "features size " << features_.size() << std::endl; - for (size_t i = 0; i < features_.size(); ++i) { - std::cout << "features[" << i << "] size " << features_[i].size() - << std::endl; - } - } - void PackTask(Scope* scope, int taskid, DataFeed* reader, int cur_batch, - const ProgramDesc& program); - void PackGpuTask(Scope* thread_scope, DataFeed* reader, - const ProgramDesc& program); - - Scope* scope_{nullptr}; - int taskid_; - int cur_batch_; - HeterTaskState state_; - // cache - std::map> features_; - std::map> feature_labels_; - std::map>> feature_values_; - std::map>> feature_grads_; - std::map> sparse_push_keys_; - double total_time{0}; - double read_time{0}; - double pack_time{0}; - double pull_sparse_local_time{0}; - double op_all_time{0}; - double xpu_op_time{0}; - 
double xpu_wait_time{0}; - double cpu_op_time{0}; - double collect_label_time{0}; - double fill_sparse_time{0}; - double push_sparse_time{0}; - double gpu_2_cpu_time{0}; - double cpu_2_gpu_time{0}; - platform::Timer timeline; -}; - -template -class HeterObjectPool { - public: - HeterObjectPool() {} - virtual ~HeterObjectPool(){}; - std::shared_ptr Get() { - std::lock_guard lock(mutex_); - if (pool_.empty()) { - num_ += 1; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - VLOG(0) << "pool construct size: " << num_; #endif - return std::make_shared(); - } else { - auto ret = pool_.back(); - pool_.pop_back(); - return ret; - } - } - void Push(std::shared_ptr data) { - std::lock_guard lock(mutex_); - pool_.push_back(std::move(data)); - } - int Size() { - std::lock_guard lock(mutex_); - return pool_.size(); - } - std::shared_ptr& GetElement(int i) { return pool_[i]; } - - private: - std::vector> pool_; - std::mutex mutex_; - int num_{0}; -}; - -struct BthreadMutextGuard { - BthreadMutextGuard(bthread_mutex_t* rho) { - mutex_ = rho; - bthread_mutex_lock(mutex_); - } - ~BthreadMutextGuard() { bthread_mutex_unlock(mutex_); } - bthread_mutex_t* mutex_; -}; - -template -class BtObjectPool { - public: - BtObjectPool() { - bthread_mutex_init(&mutex_, NULL); - bthread_cond_init(&cond_, NULL); - } - - virtual ~BtObjectPool() { - bthread_cond_destroy(&cond_); - bthread_mutex_destroy(&mutex_); - }; - - std::shared_ptr Get() { - BthreadMutextGuard guard(&mutex_); - while (pool_.empty()) { - bthread_cond_wait(&cond_, &mutex_); - } - auto ret = pool_.back(); - pool_.pop_back(); - return ret; - } - - void Push(std::shared_ptr data) { - BthreadMutextGuard guard(&mutex_); - pool_.push_back(std::move(data)); - bthread_cond_signal(&cond_); - } - - int Size() { return pool_.size(); } - - std::shared_ptr& GetElement(int i) { return pool_[i]; } - - private: - std::vector> pool_; - bthread_mutex_t mutex_; - bthread_cond_t cond_; - int num_{0}; -}; - -template -struct HeterNode { - K key; - T value; - HeterNode* prev; - HeterNode* next; -}; - -template -class HeterList { - public: - HeterList() : head_(new HeterNode), tail_(new HeterNode) { - head_->prev = NULL; - head_->next = tail_; - tail_->prev = head_; - tail_->next = NULL; - size = 0; - cap_ = 1e9; - } - - ~HeterList() { - delete head_; - delete tail_; - } - - void SetCap(int num) { cap_ = num; } - - bool TryPut(K& key, T& value) { - std::unique_lock lock(mutex_); - cond_.wait(lock, [this] { return size < cap_; }); - if (task_map_.find(key) != task_map_.end()) { - // std::cout << "try put key=" << key << " false" << std::endl; - task_map_.erase(key); - return false; - } else { - HeterNode* node = new HeterNode; - node->key = key; - node->value = value; - map_[node->key] = node; - attach(node); - // std::cout << "try put key=" << key << " true" << std::endl; - return true; - } - } - - bool Put(K& key, T& value) { - std::unique_lock lock(mutex_); - cond_.wait(lock, [this] { return size < cap_; }); - HeterNode* node = new HeterNode; - // std::cout << "put key=" << key << " true" << std::endl; - node->key = key; - node->value = value; - map_[node->key] = node; - attach(node); - return true; - } - - T TryGet(const K& key) { - std::lock_guard lock(mutex_); - auto iter = map_.find(key); - if (iter != map_.end()) { - // std::cout << "try get key=" << key << " true" << std::endl; - HeterNode* node = iter->second; - detach(node); - cond_.notify_one(); - T ret = std::move(node->value); - map_.erase(key); - delete node; - return ret; - } - task_map_.insert(key); 
- // std::cout << "try get key=" << key << " false" << std::endl; - return nullptr; - } - - T Get(const K& key) { - std::lock_guard lock(mutex_); - auto iter = map_.find(key); - if (iter != map_.end()) { - // std::cout << "get key=" << key << " true" << std::endl; - HeterNode* node = iter->second; - detach(node); - cond_.notify_one(); - T ret = std::move(node->value); - map_.erase(key); - delete node; - return ret; - } - // std::cout << "get key=" << key << " false" << std::endl; - return nullptr; - } - - T Get() { - std::lock_guard lock(mutex_); - HeterNode* node = head_->next; - if (node == tail_) { - // std::cout << "get2 false" << std::endl; - return nullptr; - } else { - detach(node); - cond_.notify_one(); - T ret = std::move(node->value); - map_.erase(node->key); - // std::cout << "get2 key=" << node->key << " true" << std::endl; - delete node; - return ret; - } - } - - bool Empty() { - std::lock_guard lock(mutex_); - return head_->next == tail_; - } - - int Size() { - std::lock_guard lock(mutex_); - return size; - } - - private: - void detach(HeterNode* node) { - node->prev->next = node->next; - node->next->prev = node->prev; - size--; - } - - void attach(HeterNode* node) { - node->prev = head_; - node->next = head_->next; - head_->next->prev = node; - head_->next = node; - size++; - } - - private: - HeterNode* head_; - HeterNode* tail_; - std::unordered_map*> map_; - std::unordered_set task_map_; - std::mutex mutex_; - std::condition_variable cond_; - int cap_; - int size; -}; } // namespace framework } // namespace paddle -#endif diff --git a/paddle/fluid/framework/heter_util.h b/paddle/fluid/framework/heter_util.h new file mode 100644 index 0000000000000000000000000000000000000000..a08f08428da346b4338ff4d7b8cc16e25118f909 --- /dev/null +++ b/paddle/fluid/framework/heter_util.h @@ -0,0 +1,329 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#ifdef PADDLE_WITH_PSLIB +#include +#include +#include // NOLINT +#include +#include // NOLINT +#include // NOLINT +#include // NOLINT +#include +#include "bthread/bthread.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/platform/timer.h" + +namespace paddle { +namespace framework { +class DataFeed; +enum HeterTaskState { PULL_SPARSE, OP_RUN, XPU, OP_RUN_END, PUSH_GRAD, DONE }; + +class HeterTask { + public: + HeterTask() {} + virtual ~HeterTask(){}; + + void Update() { + if (state_ == PULL_SPARSE) { + state_ = OP_RUN; + } else if (state_ == OP_RUN) { + state_ = XPU; + // state_ = PUSH_GRAD; + // state_ = PUSH_GRAD; + } else if (state_ == XPU) { + state_ = OP_RUN_END; + } else if (state_ == OP_RUN_END) { + state_ = PUSH_GRAD; + } else if (state_ == PUSH_GRAD) { + state_ = DONE; + } + } + void Reset() { + total_time = 0; + read_time = 0; + pack_time = 0; + pull_sparse_local_time = 0; + op_all_time = 0; + xpu_op_time = 0; + xpu_wait_time = 0; + cpu_op_time = 0; + collect_label_time = 0; + fill_sparse_time = 0; + push_sparse_time = 0; + gpu_2_cpu_time = 0; + cpu_2_gpu_time = 0; + timeline.Reset(); + } + void Show() { + std::cout << "features size " << features_.size() << std::endl; + for (size_t i = 0; i < features_.size(); ++i) { + std::cout << "features[" << i << "] size " << features_[i].size() + << std::endl; + } + } + void PackTask(Scope* scope, int taskid, DataFeed* reader, int cur_batch, + const ProgramDesc& program); + void PackGpuTask(Scope* thread_scope, DataFeed* reader, + const ProgramDesc& program); + + Scope* scope_{nullptr}; + int taskid_; + int cur_batch_; + HeterTaskState state_; + // cache + std::map> features_; + std::map> feature_labels_; + std::map>> feature_values_; + std::map>> feature_grads_; + std::map> sparse_push_keys_; + double total_time{0}; + double read_time{0}; + double pack_time{0}; + double pull_sparse_local_time{0}; + double op_all_time{0}; + double xpu_op_time{0}; + double xpu_wait_time{0}; + double cpu_op_time{0}; + double collect_label_time{0}; + double fill_sparse_time{0}; + double push_sparse_time{0}; + double gpu_2_cpu_time{0}; + double cpu_2_gpu_time{0}; + platform::Timer timeline; +}; +#endif +template +class HeterObjectPool { + public: + HeterObjectPool() {} + virtual ~HeterObjectPool(){}; + std::shared_ptr Get() { + std::lock_guard lock(mutex_); + if (pool_.empty()) { + num_ += 1; + return std::make_shared(); + } else { + auto ret = pool_.back(); + pool_.pop_back(); + return ret; + } + } + void Push(std::shared_ptr data) { + std::lock_guard lock(mutex_); + pool_.push_back(std::move(data)); + } + int Size() { + std::lock_guard lock(mutex_); + return pool_.size(); + } + std::shared_ptr& GetElement(int i) { return pool_[i]; } + + private: + std::vector> pool_; + std::mutex mutex_; + int num_{0}; +}; + +#ifdef PADDLE_WITH_PSLIB +struct BthreadMutextGuard { + BthreadMutextGuard(bthread_mutex_t* rho) { + mutex_ = rho; + bthread_mutex_lock(mutex_); + } + ~BthreadMutextGuard() { bthread_mutex_unlock(mutex_); } + bthread_mutex_t* mutex_; +}; + +template +class BtObjectPool { + public: + BtObjectPool() { + bthread_mutex_init(&mutex_, NULL); + bthread_cond_init(&cond_, NULL); + } + + virtual ~BtObjectPool() { + bthread_cond_destroy(&cond_); + bthread_mutex_destroy(&mutex_); + }; + + std::shared_ptr Get() { + BthreadMutextGuard guard(&mutex_); + while (pool_.empty()) { + bthread_cond_wait(&cond_, &mutex_); + } + auto ret = pool_.back(); + pool_.pop_back(); + return 
ret; + } + + void Push(std::shared_ptr data) { + BthreadMutextGuard guard(&mutex_); + pool_.push_back(std::move(data)); + bthread_cond_signal(&cond_); + } + + int Size() { return pool_.size(); } + + std::shared_ptr& GetElement(int i) { return pool_[i]; } + + private: + std::vector> pool_; + bthread_mutex_t mutex_; + bthread_cond_t cond_; + int num_{0}; +}; + +template +struct HeterNode { + K key; + T value; + HeterNode* prev; + HeterNode* next; +}; + +template +class HeterList { + public: + HeterList() : head_(new HeterNode), tail_(new HeterNode) { + head_->prev = NULL; + head_->next = tail_; + tail_->prev = head_; + tail_->next = NULL; + size = 0; + cap_ = 1e9; + } + + ~HeterList() { + delete head_; + delete tail_; + } + + void SetCap(int num) { cap_ = num; } + + bool TryPut(K& key, T& value) { + std::unique_lock lock(mutex_); + cond_.wait(lock, [this] { return size < cap_; }); + if (task_map_.find(key) != task_map_.end()) { + task_map_.erase(key); + return false; + } else { + HeterNode* node = new HeterNode; + node->key = key; + node->value = value; + map_[node->key] = node; + attach(node); + return true; + } + } + + bool Put(K& key, T& value) { + std::unique_lock lock(mutex_); + cond_.wait(lock, [this] { return size < cap_; }); + HeterNode* node = new HeterNode; + node->key = key; + node->value = value; + map_[node->key] = node; + attach(node); + return true; + } + + T TryGet(const K& key) { + std::lock_guard lock(mutex_); + auto iter = map_.find(key); + if (iter != map_.end()) { + HeterNode* node = iter->second; + detach(node); + cond_.notify_one(); + T ret = std::move(node->value); + map_.erase(key); + delete node; + return ret; + } + task_map_.insert(key); + return nullptr; + } + + T Get(const K& key) { + std::lock_guard lock(mutex_); + auto iter = map_.find(key); + if (iter != map_.end()) { + HeterNode* node = iter->second; + detach(node); + cond_.notify_one(); + T ret = std::move(node->value); + map_.erase(key); + delete node; + return ret; + } + return nullptr; + } + + T Get() { + std::lock_guard lock(mutex_); + HeterNode* node = head_->next; + if (node == tail_) { + return nullptr; + } else { + detach(node); + cond_.notify_one(); + T ret = std::move(node->value); + map_.erase(node->key); + delete node; + return ret; + } + } + + bool Empty() { + std::lock_guard lock(mutex_); + return head_->next == tail_; + } + + int Size() { + std::lock_guard lock(mutex_); + return size; + } + + private: + void detach(HeterNode* node) { + node->prev->next = node->next; + node->next->prev = node->prev; + size--; + } + + void attach(HeterNode* node) { + node->prev = head_; + node->next = head_->next; + head_->next->prev = node; + head_->next = node; + size++; + } + + private: + HeterNode* head_; + HeterNode* tail_; + std::unordered_map*> map_; + std::unordered_set task_map_; + std::mutex mutex_; + std::condition_variable cond_; + int cap_; + int size; +}; +} // namespace framework +} // namespace paddle +#endif diff --git a/paddle/fluid/framework/heterbox_worker.cc b/paddle/fluid/framework/heterbox_worker.cc index 726b651fcf4ec7409eee7d1893803ef67d87db7f..b7df88218cbd4dd9018e49d709922cca3b287678 100644 --- a/paddle/fluid/framework/heterbox_worker.cc +++ b/paddle/fluid/framework/heterbox_worker.cc @@ -15,7 +15,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/device_worker.h" #include "paddle/fluid/framework/device_worker_factory.h" #include "paddle/fluid/framework/fleet/fleet_wrapper.h" -#include "paddle/fluid/framework/fleet/heter_wrapper.h" +#include "paddle/fluid/framework/heter_util.h" #include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/string/string_helper.h" diff --git a/paddle/fluid/framework/heterxpu_trainer.cc b/paddle/fluid/framework/heterxpu_trainer.cc index 5e1fabf2038cc26d4da555b712cbb3199854d686..8049a1c9424bebf271f55c1247f1277a0836d88d 100644 --- a/paddle/fluid/framework/heterxpu_trainer.cc +++ b/paddle/fluid/framework/heterxpu_trainer.cc @@ -21,6 +21,7 @@ limitations under the License. */ #include "paddle/fluid/framework/data_set.h" #include "paddle/fluid/framework/device_worker_factory.h" #include "paddle/fluid/framework/fleet/fleet_wrapper.h" +#include "paddle/fluid/framework/fleet/heter_wrapper.h" #include "paddle/fluid/framework/trainer.h" #if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU) && \ (defined PADDLE_WITH_PSLIB) diff --git a/paddle/fluid/framework/hogwild_worker.cc b/paddle/fluid/framework/hogwild_worker.cc index 5de8d0fcdde91993d69c16cea805a7e4042c9645..845cdbdb2500370d41a63eeb3209b347c414ac10 100644 --- a/paddle/fluid/framework/hogwild_worker.cc +++ b/paddle/fluid/framework/hogwild_worker.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/device_worker.h" #include "paddle/fluid/operators/controlflow/conditional_block_op_helper.h" @@ -38,6 +39,9 @@ void HogwildWorker::Initialize(const TrainerDesc &desc) { for (int i = 0; i < param_.stat_var_names_size(); ++i) { stat_var_name_map_[param_.stat_var_names(i)] = 1; } +#ifdef PADDLE_WITH_HETERPS + dev_ctx_ = platform::DeviceContextPool::Instance().Get(place_); +#endif } void HogwildWorker::CreateThreadOperators(const ProgramDesc &program) { @@ -149,6 +153,9 @@ void HogwildWorker::TrainFilesWithProfiler() { VLOG(3) << "Going to run op " << op_name[i]; if (!need_skip) { ops_[i]->Run(*thread_scope_, place_); +#ifdef PADDLE_WITH_HETERPS + dev_ctx_->Wait(); +#endif } VLOG(3) << "Op " << op_name[i] << " Finished"; timeline.Pause(); @@ -166,6 +173,16 @@ void HogwildWorker::TrainFilesWithProfiler() { total_inst += cur_batch; ++batch_cnt; PrintFetchVars(); +#ifdef PADDLE_WITH_HETERPS + dev_ctx_->Wait(); + VLOG(1) << "GpuPs worker " << thread_id_ << " train cost " << total_time + << " seconds, ins_num: " << total_inst; + for (size_t i = 0; i < op_name.size(); ++i) { + VLOG(1) << "card:" << thread_id_ << ", op: " << op_name[i] + << ", mean time: " << op_total_time[i] / total_inst + << "s, totol time:" << op_total_time[i] << "sec"; + } +#else if (thread_id_ == 0) { if (batch_cnt > 0 && batch_cnt % 100 == 0) { for (size_t i = 0; i < ops_.size(); ++i) { @@ -177,6 +194,7 @@ void HogwildWorker::TrainFilesWithProfiler() { fprintf(stderr, "%6.2f instances/s\n", total_inst / total_time); } } +#endif thread_scope_->DropKids(); timeline.Start(); } @@ -194,8 +212,10 @@ void HogwildWorker::TrainFilesWithProfiler() { void HogwildWorker::TrainFiles() { platform::SetNumThreads(1); - - std::cerr << "1!!!!!" 
<< std::endl; + platform::Timer timeline; + timeline.Start(); + + int total_ins_num = 0; // how to accumulate fetched values here device_reader_->Start(); int cur_batch; @@ -215,10 +235,13 @@ void HogwildWorker::TrainFiles() { } } + total_ins_num += cur_batch; PrintFetchVars(); thread_scope_->DropKids(); } - std::cerr << "total bacth " << i << std::endl; + timeline.Pause(); + VLOG(3) << "worker " << thread_id_ << " train cost " << timeline.ElapsedSec() + << " seconds, ins_num: " << total_ins_num; #if defined PADDLE_WITH_PSCORE if (thread_barrier_) { paddle::distributed::Communicator::GetInstance()->BarrierTriggerDecrement(); @@ -230,14 +253,32 @@ void HogwildWorker::PrintFetchVars() { // call count batch_num_++; int batch_per_print = fetch_config_.print_period(); - if (thread_id_ == 0) { - if (batch_num_ % batch_per_print == 0) { - int fetch_var_num = fetch_config_.fetch_var_names_size(); - for (int i = 0; i < fetch_var_num; ++i) { - platform::PrintVar(thread_scope_, fetch_config_.fetch_var_names(i), - fetch_config_.fetch_var_str_format(i)); + int fetch_var_num = fetch_config_.fetch_var_names_size(); + + if (fetch_var_num == 0) { + return; + } + + if (thread_id_ == 0 && batch_num_ % batch_per_print == 0) { + time_t curtime; + time(&curtime); + char mbstr[80]; + std::strftime(mbstr, sizeof(mbstr), "%Y-%m-%d %H:%M:%S", + std::localtime(&curtime)); + + std::stringstream ss; + ss << "time: [" << mbstr << "], "; + ss << "batch: [" << batch_num_ << "], "; + + for (int i = 0; i < fetch_var_num; ++i) { + platform::PrintVar(thread_scope_, fetch_config_.fetch_var_names(i), + fetch_config_.fetch_var_str_format(i), &ss); + if (i < fetch_var_num - 1) { + ss << ", "; } } + + std::cout << ss.str() << std::endl; } } diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 0ca78c679aecaa396b59c7d50471baee239ba622..ab69170322ce3ec4eaa8e46b53e490b634df64b7 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -86,6 +86,7 @@ pass_library(quant_conv2d_dequant_fuse_pass inference) pass_library(shuffle_channel_detect_pass inference) pass_library(delete_quant_dequant_op_pass inference) pass_library(delete_quant_dequant_filter_op_pass inference) +pass_library(delete_dropout_op_pass inference) pass_library(simplify_with_basic_ops_pass base) pass_library(fc_elementwise_layernorm_fuse_pass base) pass_library(skip_layernorm_fuse_pass base) diff --git a/paddle/fluid/framework/ir/delete_dropout_op_pass.cc b/paddle/fluid/framework/ir/delete_dropout_op_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..09962239a01b1839bea93846ca3ffe9ded3cca4e --- /dev/null +++ b/paddle/fluid/framework/ir/delete_dropout_op_pass.cc @@ -0,0 +1,96 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
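+// DeleteDropoutOpPass links each dropout op's input var directly to the
+// consumer op and erases the dropout op together with its Out and Mask vars,
+// so inference graphs no longer carry dropout nodes.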
+#include + +#include "paddle/fluid/framework/ir/delete_dropout_op_pass.h" + +namespace paddle { +namespace framework { +class LoDTensor; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace framework { +namespace ir { + +#define GET_IR_NODE(node__) GET_IR_NODE_FROM_SUBGRAPH(node__, node__, pattern); +#define GET_NODES \ + GET_IR_NODE(any_op_out); \ + GET_IR_NODE(dropout_op); \ + GET_IR_NODE(dropout_op_out); \ + GET_IR_NODE(dropout_op_outmask); \ + GET_IR_NODE(any_op2); + +void DeleteDropoutOpPass::ApplyImpl(ir::Graph* graph) const { + const std::string pattern_name = "delete_dropout_op_pattern"; + FusePassBase::Init(pattern_name, graph); + + GraphPatternDetector gpd; + + patterns::DeleteDropoutOpPattern pattern(gpd.mutable_pattern(), pattern_name); + pattern(); + + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_NODES; + IR_NODE_LINK_TO(any_op_out, any_op2); + std::string any_op_out_name = any_op_out->Var()->Name(); + std::string dropout_op_out_name = dropout_op_out->Var()->Name(); + + auto* any_op2_desc = any_op2->Op(); + auto var_map = any_op2_desc->Inputs(); + std::string arg_name = ""; + for (auto& name_m : var_map) { + if (std::find(name_m.second.begin(), name_m.second.end(), + dropout_op_out_name) != name_m.second.end()) { + arg_name = name_m.first; + } + } + if (arg_name.size() == 0) { + LOG(INFO) << "Delete dropout op pass: can not find the input " + << dropout_op_out_name; + return; + } + + // modify the any_op2's inputs + for (auto& name_m : var_map) { + if (std::find(name_m.second.begin(), name_m.second.end(), + dropout_op_out_name) != name_m.second.end()) { + std::vector new_inputs; + for (auto& i_n : name_m.second) { + if (i_n != dropout_op_out_name) { + new_inputs.push_back(i_n); + } + } + new_inputs.push_back(any_op_out_name); + any_op2_desc->SetInput(name_m.first, new_inputs); + any_op2_desc->Flush(); + } + } + any_op2_desc->Flush(); + // Delete the unneeded nodes. + GraphSafeRemoveNodes(graph, + {dropout_op, dropout_op_out, dropout_op_outmask}); + }; + + gpd(graph, handler); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(delete_dropout_op_pass, + paddle::framework::ir::DeleteDropoutOpPass); diff --git a/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.cc b/paddle/fluid/framework/ir/delete_dropout_op_pass.h similarity index 62% rename from paddle/fluid/operators/distributed/async_sparse_param_update_recorder.cc rename to paddle/fluid/framework/ir/delete_dropout_op_pass.h index 3f3b6b959e30194c10b1a58d6fc3e7a61ad01313..c49abf3c871ced474bc47e28ec32d29bc9ccf750 100644 --- a/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.cc +++ b/paddle/fluid/framework/ir/delete_dropout_op_pass.h @@ -12,16 +12,26 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h" +#pragma once +#include + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" namespace paddle { -namespace operators { -namespace distributed { +namespace framework { +namespace ir { + +class Graph; + +class DeleteDropoutOpPass : public FusePassBase { + public: + virtual ~DeleteDropoutOpPass() {} -std::once_flag AsyncSparseParamUpdateRecorder::init_flag_; -std::unique_ptr - AsyncSparseParamUpdateRecorder::recorder_(nullptr); + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; -} // namespace distributed -} // namespace operators +} // namespace ir +} // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc index 84c6b03e76bc1efd9e7d4c34b9b6151b16bf4040..48f79e63b4f0ea51df27695943690c1c36727e93 100644 --- a/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc @@ -34,15 +34,19 @@ namespace patterns { static PDNode* create_emb_vars(PDPattern* pattern, const std::string& name, const std::string& arg, bool is_persist = false) { + std::unordered_set embedding_ops{"lookup_table", + "lookup_table_v2"}; PDNode* node = - pattern->NewNode(name)->assert_is_op_input("lookup_table", arg); + pattern->NewNode(name)->assert_is_ops_input(embedding_ops, arg); if (is_persist) return node->assert_is_persistable_var(); return node; } static PDNode* create_emb_out_vars(PDPattern* pattern, const std::string& name, const std::string& arg) { + std::unordered_set embedding_ops{"lookup_table", + "lookup_table_v2"}; PDNode* node = pattern->NewNode(name) - ->assert_is_only_output_of_op("lookup_table") + ->assert_is_only_output_of_ops(embedding_ops) ->assert_is_op_input("elementwise_add", arg) ->AsIntermediate(); return node; @@ -56,10 +60,12 @@ void Embedding2Eltwise1Pattern::operator()() { create_emb_vars(pattern, lookup_table1_w_repr(), "W", true); auto* lookup_table2_w = create_emb_vars(pattern, lookup_table2_w_repr(), "W", true); + std::unordered_set embedding_ops{"lookup_table", + "lookup_table_v2"}; auto* lookup_table1 = - pattern->NewNode(lookup_table1_repr())->assert_is_op("lookup_table"); + pattern->NewNode(lookup_table1_repr())->assert_is_ops(embedding_ops); auto* lookup_table2 = - pattern->NewNode(lookup_table2_repr())->assert_is_op("lookup_table"); + pattern->NewNode(lookup_table2_repr())->assert_is_ops(embedding_ops); auto* lookup_table1_out = create_emb_out_vars(pattern, lookup_table1_out_repr(), "X"); auto* lookup_table2_out = @@ -80,8 +86,10 @@ void Embedding1Eltwise1Pattern::operator()() { create_emb_vars(pattern, lookup_table1_x_repr(), "Ids"); auto* lookup_table1_w = create_emb_vars(pattern, lookup_table1_w_repr(), "W", true); + std::unordered_set embedding_ops{"lookup_table", + "lookup_table_v2"}; auto* lookup_table1 = - pattern->NewNode(lookup_table1_repr())->assert_is_op("lookup_table"); + pattern->NewNode(lookup_table1_repr())->assert_is_ops(embedding_ops); auto* lookup_table1_out = create_emb_out_vars(pattern, lookup_table1_out_repr(), "Y"); auto* eltwise_add = @@ -282,15 +290,30 @@ static int BuildFusion(Graph* graph, const std::string& name_scope ids.push_back(inner_pattern_ins[js[iter]].first->Name()); embs.push_back(inner_pattern_ins[js[iter]].second->Name()); } + OpDesc new_op_desc; 
new_op_desc.SetType("fused_embedding_eltwise_layernorm"); new_op_desc.SetInput("Ids", ids); new_op_desc.SetInput("Embs", embs); + + new_op_desc.SetInput("WordId", {ids[0]}); + new_op_desc.SetInput("PosId", {ids[1]}); + new_op_desc.SetInput("SentId", {ids[2]}); + + new_op_desc.SetInput("WordEmbedding", {embs[0]}); + new_op_desc.SetInput("PosEmbedding", {embs[1]}); + new_op_desc.SetInput("SentEmbedding", {embs[2]}); + new_op_desc.SetInput("Bias", {end_pattern_biases[k]->Name()}); new_op_desc.SetInput("Scale", {end_pattern_scales[k]->Name()}); new_op_desc.SetOutput("Out", {end_pattern_out[k]->Name()}); new_op_desc.SetAttr("epsilon", end_patter_layernorms[k]->Op()->GetAttr("epsilon")); + + if (end_patter_layernorms[k]->Op()->HasAttr("out_threshold")) { + new_op_desc.SetAttr("enable_int8", true); + } + auto* embedding_eltwise_layernorm = graph->CreateOpNode(&new_op_desc); for (size_t iter = 0; iter < start_pattern_in_nodes[i].size(); ++iter) { @@ -347,4 +370,5 @@ REGISTER_PASS_CAPABILITY(embedding_eltwise_layernorm_fuse_pass) .AddCombination( paddle::framework::compatible::OpVersionComparatorCombination() .EQ("lookup_table", 0) + .LE("lookup_table_v2", 1) .EQ("elementweise_add", 0)); diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index deb182c0fbe19c0ec9cb2e6f4b215b7983be3371..064da3d941602ee0e4f868fb0dbda305102da32b 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -652,6 +652,36 @@ PDNode *PDNode::assert_is_ops_input( return this; } +PDNode *PDNode::assert_is_only_input_of_ops( + const std::unordered_set &op_types) { + assert_is_var(); + asserts_.emplace_back([=](Node *x) { + for (auto *op : x->outputs) { + if (op && op->IsOp() && op->Op() && op_types.count(op->Op()->Type()) && + op->inputs.size() == 1) { + return true; + } + } + return false; + }); + return this; +} + +PDNode *PDNode::assert_is_only_output_of_ops( + const std::unordered_set &op_types) { + assert_is_var(); + asserts_.emplace_back([=](Node *x) { + for (auto *op : x->inputs) { + if (op && op->IsOp() && op->Op() && op_types.count(op->Op()->Type()) && + op->outputs.size() == 1) { + return true; + } + } + return false; + }); + return this; +} + bool VarLinksToOp(Node *node, const std::string &op_type) { for (auto *out : node->outputs) { if (out->IsOp() && out->Op()->Type() == op_type) { @@ -2409,6 +2439,29 @@ PDNode *patterns::TransposeFlattenConcat::operator()( return concat_out; } +void patterns::DeleteDropoutOpPattern::operator()() { + auto any_op_out = pattern->NewNode(any_op_out_repr()) + ->assert_is_op_input("dropout", "X") + ->AsInput(); + + auto dropout_op = + pattern->NewNode(dropout_op_repr())->assert_is_op("dropout"); + + auto dropout_op_out = pattern->NewNode(dropout_op_out_repr()) + ->assert_is_op_output("dropout", "Out") + ->AsIntermediate(); + + auto dropout_op_outmask = pattern->NewNode(dropout_op_outmask_repr()) + ->assert_is_op_output("dropout", "Mask") + ->AsOutput(); + auto any_op2 = pattern->NewNode(any_op2_repr())->assert_is_op()->AsOutput(); + + dropout_op->LinksFrom({any_op_out}); + dropout_op_out->LinksFrom({dropout_op}); + dropout_op_outmask->LinksFrom({dropout_op}); + any_op2->LinksFrom({dropout_op_out}); +} + void patterns::DeleteQuantOpFuse::operator()(PDNode *input_act_node, const std::string &quant_type) { auto *input_scale_node = pattern->NewNode(GetNodeName("input_scale_node")) diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h 
b/paddle/fluid/framework/ir/graph_pattern_detector.h index 2e518c1d4df72aae98b21233918e335d4286b3de..13f65859954d58ce446ab3b9de488833f6220dee 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -28,7 +28,6 @@ #include #include -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/node.h" #include "paddle/fluid/inference/analysis/dot.h" @@ -146,6 +145,11 @@ struct PDNode { const std::unordered_set& op_types, const std::string& argument, int nth); + PDNode* assert_is_only_input_of_ops( + const std::unordered_set& op_types); + PDNode* assert_is_only_output_of_ops( + const std::unordered_set& op_types); + PDNode* assert_has_n_inputs(size_t n); PDNode* assert_has_n_outputs(size_t n); @@ -1460,6 +1464,19 @@ struct ShuffleChannelPattern : public PatternBase { PATTERN_DECL_NODE(reshape2_out); }; +struct DeleteDropoutOpPattern : public PatternBase { + DeleteDropoutOpPattern(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "delete_dropout_op_pattern") {} + + void operator()(); + + PATTERN_DECL_NODE(any_op_out); + PATTERN_DECL_NODE(dropout_op); + PATTERN_DECL_NODE(dropout_op_out); + PATTERN_DECL_NODE(dropout_op_outmask); + PATTERN_DECL_NODE(any_op2); +}; + struct DeleteQuantDequantOpPattern : public PatternBase { DeleteQuantDequantOpPattern(PDPattern* pattern, const std::string& name_scope) : PatternBase(pattern, name_scope, "delete_quantdequant_op_pattern") {} diff --git a/paddle/fluid/framework/ir/is_test_pass.cc b/paddle/fluid/framework/ir/is_test_pass.cc index 0a70440765d44d2eac39091c72d500ffb746089d..25bf03f426a1d9d77c17b26ece92943d71a9ed81 100644 --- a/paddle/fluid/framework/ir/is_test_pass.cc +++ b/paddle/fluid/framework/ir/is_test_pass.cc @@ -35,7 +35,7 @@ void IsTestPass::ApplyImpl(ir::Graph* graph) const { "hard_shrink", "hard_sigmoid", "relu6", "soft_relu", "swish", "thresholded_relu", "log", "square", "softplus", - "softsign"}; + "softsign", "silu"}; for (const Node* n : graph->Nodes()) { if (n->IsOp()) { auto* op = n->Op(); diff --git a/paddle/fluid/framework/ir/layer_norm_fuse_pass.cc b/paddle/fluid/framework/ir/layer_norm_fuse_pass.cc index 69edc3d87f97d6762079a37c920f5ece57903cfa..18d2e9817ebec857e1b13d7d6e0e9f2201a69d94 100644 --- a/paddle/fluid/framework/ir/layer_norm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/layer_norm_fuse_pass.cc @@ -14,7 +14,6 @@ #include -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/layer_norm_fuse_pass.h" #include "paddle/fluid/framework/op_version_registry.h" diff --git a/paddle/fluid/framework/ir/layer_norm_fuse_pass_tester.cc b/paddle/fluid/framework/ir/layer_norm_fuse_pass_tester.cc index 5fd47b21733b54009954843fe02ae81f171f1554..5fe71fbc21451f13991cab4f612d251d028ac792 100644 --- a/paddle/fluid/framework/ir/layer_norm_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/layer_norm_fuse_pass_tester.cc @@ -17,7 +17,6 @@ #include #include "paddle/fluid/framework/block_desc.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/ir/layer_norm_fuse_pass.h" #include "paddle/fluid/framework/ir/pass_test_util.h" #include "paddle/fluid/framework/naive_executor.h" diff --git a/paddle/fluid/framework/ir/map_matmul_to_mul_pass.cc b/paddle/fluid/framework/ir/map_matmul_to_mul_pass.cc index 
a2443c86986ec87cc29e9897fe0d38883f8fafa1..c36123f65f6644289cfba2b2729862efa601e2fd 100644 --- a/paddle/fluid/framework/ir/map_matmul_to_mul_pass.cc +++ b/paddle/fluid/framework/ir/map_matmul_to_mul_pass.cc @@ -57,7 +57,7 @@ void MapMatmul2MulPass::ApplyImpl(ir::Graph* graph) const { std::vector y_shape = matmul_in_y->Var()->GetShape(); size_t x_rank = x_shape.size(); size_t y_rank = y_shape.size(); - flag = flag && x_rank == 2 && y_rank == 2; + flag = flag && (x_rank == 2 || x_rank == 3) && y_rank == 2; std::vector& next_ops = matmul_out->outputs; flag = flag && next_ops.size() == 1 && @@ -69,7 +69,7 @@ void MapMatmul2MulPass::ApplyImpl(ir::Graph* graph) const { desc.SetInput("X", {matmul_in_x->Name()}); desc.SetInput("Y", {matmul_in_y->Name()}); desc.SetOutput("Out", {matmul_out->Name()}); - desc.SetAttr("x_num_col_dims", 1); + desc.SetAttr("x_num_col_dims", static_cast(x_rank - 1)); desc.SetAttr("y_num_col_dims", 1); if (matmul_op->Op()->HasAttr("enable_int8")) { desc.SetAttr("enable_int8", matmul_op->Op()->GetAttr("enable_int8")); diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.cc index 3d7a9c1107bbaac04a3a478014520a9b340b1d5f..531a04e1a0d4c11799e8dea520faed447de4e808 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.cc @@ -53,7 +53,7 @@ void CPUBfloat16PlacementPass::SetMkldnnDataType( gpd(graph, handler); } -void CPUBfloat16PlacementPass::RemoveOrhanedOperators( +void CPUBfloat16PlacementPass::RemoveOrphanedOperators( ir::Graph* graph, int* bfloat16_operators) const { // find orphaned bfloat16 operator that is between two float32 operators // revert mkldnn_data_type attr to float32 @@ -74,7 +74,7 @@ void CPUBfloat16PlacementPass::RemoveOrhanedOperators( void CPUBfloat16PlacementPass::ApplyImpl(ir::Graph* graph) const { int bfloat16_operators = 0; SetMkldnnDataType(graph, &bfloat16_operators); - RemoveOrhanedOperators(graph, &bfloat16_operators); + RemoveOrphanedOperators(graph, &bfloat16_operators); PrettyLogDetail("--- marked %d operators to bfloat16 ", bfloat16_operators); } diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.h index 1911b1a3cb32a6a23585e8240c462aa84e8d869b..53b97f0e9726aacf86f6f71d3382ab25241e3cdb 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.h @@ -28,7 +28,7 @@ class CPUBfloat16PlacementPass : public Pass { protected: void SetMkldnnDataType(ir::Graph* graph, int* bfloat16_operators) const; - void RemoveOrhanedOperators(ir::Graph* graph, int* bfloat16_operators) const; + void RemoveOrphanedOperators(ir::Graph* graph, int* bfloat16_operators) const; void ApplyImpl(ir::Graph* graph) const override; }; diff --git a/paddle/fluid/framework/ir/mkldnn/interpolate_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/interpolate_mkldnn_pass.cc index 06df1caca35b922ac96d7d886296a6dee6bfb764..4eb532b47cb4b59cb3df0fe775400caa01354269 100644 --- a/paddle/fluid/framework/ir/mkldnn/interpolate_mkldnn_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/interpolate_mkldnn_pass.cc @@ -43,8 +43,9 @@ void InterpolateMKLDNNPass::ApplyImpl(ir::Graph* graph) const { int found_count = 0; const std::vector interpolate_op_types = { - "bilinear_interp", "nearest_interp", "trilinear_interp", "bicubic_interp", - "linear_interp"}; + 
"bilinear_interp", "nearest_interp", "trilinear_interp", + "bicubic_interp", "linear_interp", "bilinear_interp_v2", + "nearest_interp_v2"}; for (const Node* node : graph->Nodes()) { if (node->IsOp() && diff --git a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc index e20c0667ec3bc2834eccb2b70b0e741d1051f7ce..57bee20247c9644941f87db48406ef2b097a23fb 100644 --- a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc +++ b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc @@ -535,6 +535,38 @@ static int BuildFusionV2(Graph* graph, const std::string& name_scope, multihead_op_desc.SetAttr("alpha", scale_attr); multihead_op_desc.SetAttr("head_number", head_number); + auto* mul0_op_desc = mul0->Op(); + auto* mul1_op_desc = mul1->Op(); + auto* mul2_op_desc = mul2->Op(); + if (mul0_op_desc->HasAttr("enable_int8")) { + multihead_op_desc.SetAttr("enable_int8", + mul0_op_desc->GetAttr("enable_int8")); + // all mul op has same input. + multihead_op_desc.SetAttr("Input_scale", + mul0_op_desc->GetAttr("X_scale")); + auto weight_scale0 = BOOST_GET_CONST( + std::vector, mul0_op_desc->GetAttr("weight_scale")); + auto weight_scale1 = BOOST_GET_CONST( + std::vector, mul1_op_desc->GetAttr("weight_scale")); + auto weight_scale2 = BOOST_GET_CONST( + std::vector, mul2_op_desc->GetAttr("weight_scale")); + auto weight_max = std::max(weight_scale0, weight_scale1); + weight_max = std::max(weight_max, weight_scale2); + multihead_op_desc.SetAttr("weight_scale", weight_max); + + if (mul0_op_desc->HasAttr("out_threshold")) { + auto out_scale0 = + BOOST_GET_CONST(float, mul0_op_desc->GetAttr("out_threshold")); + auto out_scale1 = + BOOST_GET_CONST(float, mul1_op_desc->GetAttr("out_threshold")); + auto out_scale2 = + BOOST_GET_CONST(float, mul2_op_desc->GetAttr("out_threshold")); + auto out_scale_max = std::max(out_scale0, out_scale1); + out_scale_max = std::max(out_scale_max, out_scale2); + multihead_op_desc.SetAttr("out_threshold", out_scale_max); + } + } + auto* multihead = graph->CreateOpNode(&multihead_op_desc); IR_NODE_LINK_TO(input0, multihead); @@ -682,6 +714,447 @@ static int BuildFusionV2(Graph* graph, const std::string& name_scope, return fusion_count; } +PDNode* MultiHeadMatmulV3Pattern::operator()() { + std::unordered_set matmul_ops{"matmul", "matmul_v2"}; + auto* input0 = pattern->NewNode(input0_repr()); + input0->assert_is_op_input("matmul"); + + // First path with scale + auto* mul0 = pattern->NewNode(mul0_repr())->assert_is_op("matmul"); + auto* mul0_w_var = pattern->NewNode(mul0_w_repr()) + ->AsInput() + ->assert_is_op_input("matmul", "Y"); + auto* mul0_out_var = + pattern->NewNode(mul0_out_repr())->assert_is_op_output("matmul"); + + decltype(mul0) eltadd0; + decltype(mul0) eltadd0_b_var; + decltype(mul0) eltadd0_out_var; + + mul0_out_var->AsIntermediate()->assert_is_op_input("elementwise_add"); + + eltadd0 = pattern->NewNode(eltadd0_repr())->assert_is_op("elementwise_add"); + eltadd0_b_var = pattern->NewNode(eltadd0_b_repr()) + ->AsInput() + ->assert_is_op_input("elementwise_add", "Y"); + + eltadd0_out_var = pattern->NewNode(eltadd0_out_repr()) + ->assert_is_op_output("elementwise_add"); + eltadd0_out_var->AsIntermediate()->assert_is_op_input("reshape2"); + + auto* reshape2_0 = + pattern->NewNode(reshape2_0_repr())->assert_is_op("reshape2"); + + auto* reshape2_0_out_var = + pattern->NewNode(reshape2_0_out_repr())->assert_is_op_output("reshape2"); + reshape2_0_out_var->AsIntermediate()->assert_is_op_input("transpose2"); + + auto* 
transpose2_0 = + pattern->NewNode(transpose2_0_repr())->assert_is_op("transpose2"); + auto* transpose2_0_out_var = pattern->NewNode(transpose2_0_out_repr()) + ->assert_is_op_output("transpose2"); + transpose2_0_out_var->AsIntermediate()->assert_is_op_input("matmul", "X"); + + auto* matmul_qk = pattern->NewNode(matmul_qk_repr())->assert_is_op("matmul"); + auto* matmul_qk_out_var = + pattern->NewNode(matmul_qk_out_repr())->assert_is_op_output("matmul"); + matmul_qk_out_var->AsIntermediate()->assert_is_op_input("elementwise_add"); + + auto* eltadd_qk = + pattern->NewNode(eltadd_qk_repr())->assert_is_op("elementwise_add"); + auto* eltadd_qk_b_var = pattern->NewNode(eltadd_qk_b_repr()) + ->AsInput() + ->assert_is_op_input("elementwise_add", "Y"); + auto* eltadd_qk_out_var = pattern->NewNode(eltadd_qk_out_repr()) + ->assert_is_op_output("elementwise_add"); + eltadd_qk_out_var->AsIntermediate()->assert_is_op_input("softmax"); + + auto* softmax_qk = + pattern->NewNode(softmax_qk_repr())->assert_is_op("softmax"); + auto* softmax_qk_out_var = + pattern->NewNode(softmax_qk_out_repr())->assert_is_op_output("softmax"); + softmax_qk_out_var->AsIntermediate()->assert_is_ops_input(matmul_ops); + + auto* matmul_qkv = + pattern->NewNode(matmul_qkv_repr())->assert_is_ops(matmul_ops); + auto* matmul_qkv_out_var = + pattern->NewNode(matmul_qkv_out_repr())->assert_is_ops_output(matmul_ops); + matmul_qkv_out_var->AsIntermediate()->assert_is_op_input("transpose2"); + + auto* transpose2_qkv = + pattern->NewNode(transpose2_qkv_repr())->assert_is_op("transpose2"); + auto* transpose2_qkv_out_var = pattern->NewNode(transpose2_qkv_out_repr()) + ->assert_is_op_output("transpose2"); + transpose2_qkv_out_var->AsIntermediate()->assert_is_op_input("reshape2"); + + auto* reshape2_qkv = + pattern->NewNode(reshape2_qkv_repr())->assert_is_op("reshape2"); + auto* reshape2_qkv_out_var = pattern->NewNode(reshape2_qkv_out_repr()) + ->assert_is_op_output("reshape2"); + reshape2_qkv_out_var->assert_is_op_input("matmul"); + + // Second path to matmul + auto* mul1 = pattern->NewNode(mul1_repr())->assert_is_op("matmul"); + auto* mul1_w_var = pattern->NewNode(mul1_w_repr()) + ->AsInput() + ->assert_is_op_input("matmul", "Y"); + auto* mul1_out_var = + pattern->NewNode(mul1_out_repr())->assert_is_op_output("matmul"); + + decltype(mul1) eltadd1; + decltype(mul1) eltadd1_b_var; + decltype(mul1) eltadd1_out_var; + + mul1_out_var->AsIntermediate()->assert_is_op_input("elementwise_add"); + eltadd1 = pattern->NewNode(eltadd1_repr())->assert_is_op("elementwise_add"); + eltadd1_b_var = pattern->NewNode(eltadd1_b_repr()) + ->AsInput() + ->assert_is_op_input("elementwise_add", "Y"); + + eltadd1_out_var = pattern->NewNode(eltadd1_out_repr()) + ->assert_is_op_output("elementwise_add"); + eltadd1_out_var->AsIntermediate()->assert_is_op_input("reshape2"); + + auto* reshape2_1 = + pattern->NewNode(reshape2_1_repr())->assert_is_op("reshape2"); + + auto* reshape2_1_out_var = + pattern->NewNode(reshape2_1_out_repr())->assert_is_op_output("reshape2"); + reshape2_1_out_var->AsIntermediate()->assert_is_op_input("transpose2"); + + auto* transpose2_1 = + pattern->NewNode(transpose2_1_repr())->assert_is_op("transpose2"); + auto* transpose2_1_out_var = pattern->NewNode(transpose2_1_out_repr()) + ->assert_is_op_output("transpose2"); + transpose2_1_out_var->AsIntermediate()->assert_is_op_input( + "matmul", "Y"); // link to matmul qk + + // Third path to matmul + auto* mul2 = pattern->NewNode(mul2_repr())->assert_is_op("matmul"); + auto* mul2_w_var = 
pattern->NewNode(mul2_w_repr()) + ->AsInput() + ->assert_is_op_input("matmul", "Y"); + auto* mul2_out_var = + pattern->NewNode(mul2_out_repr())->assert_is_op_output("matmul"); + + decltype(mul2) eltadd2; + decltype(mul2) eltadd2_b_var; + decltype(mul2) eltadd2_out_var; + + mul2_out_var->AsIntermediate()->assert_is_op_input("elementwise_add"); + eltadd2 = pattern->NewNode(eltadd2_repr())->assert_is_op("elementwise_add"); + eltadd2_b_var = pattern->NewNode(eltadd2_b_repr()) + ->AsInput() + ->assert_is_op_input("elementwise_add", "Y"); + + eltadd2_out_var = pattern->NewNode(eltadd2_out_repr()) + ->assert_is_op_output("elementwise_add"); + eltadd2_out_var->AsIntermediate()->assert_is_op_input("reshape2"); + + auto* reshape2_2 = + pattern->NewNode(reshape2_2_repr())->assert_is_op("reshape2"); + + auto* reshape2_2_out_var = + pattern->NewNode(reshape2_2_out_repr())->assert_is_op_output("reshape2"); + reshape2_2_out_var->AsIntermediate()->assert_is_op_input("transpose2"); + + auto* transpose2_2 = + pattern->NewNode(transpose2_2_repr())->assert_is_op("transpose2"); + auto* transpose2_2_out_var = pattern->NewNode(transpose2_2_out_repr()) + ->assert_is_op_output("transpose2"); + transpose2_2_out_var->AsIntermediate()->assert_is_ops_input( + matmul_ops); // link to matmul qkv + + // Q path + mul0->LinksFrom({input0, mul0_w_var}).LinksTo({mul0_out_var}); + eltadd0->LinksFrom({mul0_out_var, eltadd0_b_var}).LinksTo({eltadd0_out_var}); + + reshape2_0->LinksFrom({eltadd0_out_var}).LinksTo({reshape2_0_out_var}); + transpose2_0->LinksFrom({reshape2_0_out_var}).LinksTo({transpose2_0_out_var}); + // K path + mul1->LinksFrom({input0, mul1_w_var}).LinksTo({mul1_out_var}); + eltadd1->LinksFrom({mul1_out_var, eltadd1_b_var}).LinksTo({eltadd1_out_var}); + reshape2_1->LinksFrom({eltadd1_out_var}).LinksTo({reshape2_1_out_var}); + transpose2_1->LinksFrom({reshape2_1_out_var}).LinksTo({transpose2_1_out_var}); + // compute q*k + matmul_qk->LinksFrom({transpose2_0_out_var, transpose2_1_out_var}) + .LinksTo({matmul_qk_out_var}); + eltadd_qk->LinksFrom({matmul_qk_out_var, eltadd_qk_b_var}) + .LinksTo({eltadd_qk_out_var}); + softmax_qk->LinksFrom({eltadd_qk_out_var}).LinksTo({softmax_qk_out_var}); + // V path + mul2->LinksFrom({input0, mul2_w_var}).LinksTo({mul2_out_var}); + eltadd2->LinksFrom({mul2_out_var, eltadd2_b_var}).LinksTo({eltadd2_out_var}); + reshape2_2->LinksFrom({eltadd2_out_var}).LinksTo({reshape2_2_out_var}); + transpose2_2->LinksFrom({reshape2_2_out_var}).LinksTo({transpose2_2_out_var}); + // compute q*k*v + matmul_qkv->LinksFrom({softmax_qk_out_var, transpose2_2_out_var}) + .LinksTo({matmul_qkv_out_var}); + transpose2_qkv->LinksFrom({matmul_qkv_out_var}) + .LinksTo({transpose2_qkv_out_var}); + reshape2_qkv->LinksFrom({transpose2_qkv_out_var}) + .LinksTo({reshape2_qkv_out_var}); + + return transpose2_2_out_var; +} + +static int BuildFusionV3(Graph* graph, const std::string& name_scope, + Scope* scope) { + GraphPatternDetector gpd; + auto* pattern = gpd.mutable_pattern(); + + // Create pattern. 
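+  // MultiHeadMatmulV3Pattern matches three parallel matmul + elementwise_add +
+  // reshape2 + transpose2 branches (Q, K, V) feeding a Q*K matmul, bias add and
+  // softmax, followed by the context matmul (matmul or matmul_v2), transpose2
+  // and reshape2.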
+ MultiHeadMatmulV3Pattern multihead_pattern(pattern, name_scope); + + multihead_pattern(); + // Create New OpDesc + auto fuse_creater = [&]( + Node* input0, Node* mul0, Node* mul1, Node* mul2, Node* mul0_out, + Node* mul1_out, Node* mul2_out, Node* mul0_w, Node* mul1_w, Node* mul2_w, + Node* eltadd0_b, Node* eltadd1_b, Node* eltadd2_b, Node* eltadd_qk_b, + Node* reshape2, Node* reshape2_qkv_out, Node* matmul_qk) { + auto scale_attr = BOOST_GET_CONST(float, matmul_qk->Op()->GetAttr("alpha")); + + // mul (B * S * Hidden) x (Hidden * 3 * N * H) = (B * S * 3 * N * H) + // bias (B * S * 3 * N * H) + bias (3 * N * H) + // Transpose (B * S * 3 * N * H) -> (3 * B * N * S * H) + auto* wq_tensor = scope->FindVar(mul0_w->Name())->GetMutable(); + auto* wk_tensor = scope->FindVar(mul1_w->Name())->GetMutable(); + auto* wv_tensor = scope->FindVar(mul2_w->Name())->GetMutable(); + + auto* bq_tensor = + scope->FindVar(eltadd0_b->Name())->GetMutable(); + auto* bk_tensor = + scope->FindVar(eltadd1_b->Name())->GetMutable(); + auto* bv_tensor = + scope->FindVar(eltadd2_b->Name())->GetMutable(); + + auto* wq_data = wq_tensor->mutable_data(platform::CPUPlace()); + auto* wk_data = wk_tensor->mutable_data(platform::CPUPlace()); + auto* wv_data = wv_tensor->mutable_data(platform::CPUPlace()); + auto* bq_data = bq_tensor->mutable_data(platform::CPUPlace()); + auto* bk_data = bk_tensor->mutable_data(platform::CPUPlace()); + auto* bv_data = bv_tensor->mutable_data(platform::CPUPlace()); + + auto combined_w_dims = + framework::make_ddim({wq_tensor->dims()[0], 3, wq_tensor->dims()[1]}); + auto combined_bias_dims = framework::make_ddim({3, bq_tensor->dims()[0]}); + + // reuse the mul0_w and eltadd_0_b nodes for the combined nodes. + auto* combined_w_desc = mul0_w->Var(); + combined_w_desc->SetShape({wq_tensor->dims()[0], 3, wq_tensor->dims()[1]}); + combined_w_desc->SetPersistable(true); + + auto* combined_bias_desc = eltadd0_b->Var(); + combined_bias_desc->SetShape({3, bq_tensor->dims()[0]}); + combined_bias_desc->SetPersistable(true); + + framework::LoDTensor tmp_combined_w_tensor; + tmp_combined_w_tensor.Resize(combined_w_dims); + auto* tmp_combined_w_data = + tmp_combined_w_tensor.mutable_data(platform::CPUPlace()); + + std::vector w_vec = {wq_data, wk_data, wv_data}; + int dims_h = combined_w_dims[0], dims_w = combined_w_dims[2]; + // Combine the three fc weights together. 
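+    // Combined layout is (dims_h, 3, dims_w): for each input row i, the Q, K
+    // and V weight rows are interleaved along the middle dimension, matching
+    // the (Hidden, 3, N*H) shape set on mul0_w above.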
+ for (int i = 0; i < dims_h; i++) { + for (int j = 0; j < 3; j++) { + for (int k = 0; k < dims_w; k++) { + int out_index = i * (3 * dims_w) + j * dims_w + k; + int in_index = i * dims_w + k; + tmp_combined_w_data[out_index] = w_vec[j][in_index]; + } + } + } + + wq_tensor->Resize(combined_w_dims); + auto* new_combined_w_data = + wq_tensor->mutable_data(platform::CPUPlace()); + memcpy(new_combined_w_data, tmp_combined_w_data, + sizeof(float) * wq_tensor->numel()); + + scope->EraseVars({mul1_w->Name(), mul2_w->Name()}); + + framework::LoDTensor tmp_combined_bias_tensor; + tmp_combined_bias_tensor.Resize(combined_bias_dims); + auto* tmp_combined_bias_data = + tmp_combined_bias_tensor.mutable_data(platform::CPUPlace()); + + size_t bias_size = bq_tensor->numel(); + memcpy(tmp_combined_bias_data, bq_data, sizeof(float) * bias_size); + memcpy(tmp_combined_bias_data + bias_size, bk_data, + sizeof(float) * bias_size); + memcpy(tmp_combined_bias_data + 2 * bias_size, bv_data, + sizeof(float) * bias_size); + + bq_tensor->Resize(combined_bias_dims); + auto* new_combined_bias_data = + bq_tensor->mutable_data(platform::CPUPlace()); + memcpy(new_combined_bias_data, tmp_combined_bias_data, + sizeof(float) * bq_tensor->numel()); + + scope->EraseVars({eltadd1_b->Name(), eltadd2_b->Name()}); + + auto reshape_desc = reshape2->Op(); + int head_number = + BOOST_GET_CONST(std::vector, reshape_desc->GetAttr("shape")).at(2); + + OpDesc multihead_op_desc; + multihead_op_desc.SetType("multihead_matmul"); + + multihead_op_desc.SetInput("Input", {input0->Name()}); + multihead_op_desc.SetInput("W", {mul0_w->Name()}); + multihead_op_desc.SetInput("Bias", {eltadd0_b->Name()}); + multihead_op_desc.SetInput("BiasQK", {eltadd_qk_b->Name()}); + + multihead_op_desc.SetOutput("Out", {reshape2_qkv_out->Name()}); + multihead_op_desc.SetAttr("alpha", scale_attr); + multihead_op_desc.SetAttr("head_number", head_number); + + auto* multihead = graph->CreateOpNode(&multihead_op_desc); + + IR_NODE_LINK_TO(input0, multihead); + IR_NODE_LINK_TO(mul0_w, multihead); + IR_NODE_LINK_TO(eltadd0_b, multihead); + IR_NODE_LINK_TO(eltadd_qk_b, multihead); + + IR_NODE_LINK_TO(multihead, reshape2_qkv_out); + }; + + int fusion_count{0}; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + // GET_IR_NODE_FROM_SUBGRAPH(dropout_out, dropout_out, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(input0, input0, multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(mul0, mul0, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(mul0_out, mul0_out, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(mul0_w, mul0_w, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(reshape2_0, reshape2_0, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(reshape2_0_out, reshape2_0_out, + multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(transpose2_0, transpose2_0, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(transpose2_0_out, transpose2_0_out, + multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(mul1, mul1, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(mul1_out, mul1_out, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(mul1_w, mul1_w, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(reshape2_1, reshape2_1, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(reshape2_1_out, reshape2_1_out, + multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(transpose2_1, transpose2_1, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(transpose2_1_out, transpose2_1_out, + multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(mul2, mul2, multihead_pattern); + 
GET_IR_NODE_FROM_SUBGRAPH(mul2_out, mul2_out, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(mul2_w, mul2_w, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(reshape2_2, reshape2_2, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(reshape2_2_out, reshape2_2_out, + multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(transpose2_2, transpose2_2, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(transpose2_2_out, transpose2_2_out, + multihead_pattern); + + // nodes need be removed + GET_IR_NODE_FROM_SUBGRAPH(eltadd0, eltadd0, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltadd0_b, eltadd0_b, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltadd0_out, eltadd0_out, multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(eltadd1, eltadd1, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltadd1_b, eltadd1_b, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltadd1_out, eltadd1_out, multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(eltadd2, eltadd2, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltadd2_b, eltadd2_b, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltadd2_out, eltadd2_out, multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(matmul_qk, matmul_qk, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_qk_out, matmul_qk_out, multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(eltadd_qk, eltadd_qk, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltadd_qk_b, eltadd_qk_b, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltadd_qk_out, eltadd_qk_out, multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(softmax_qk, softmax_qk, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(softmax_qk_out, softmax_qk_out, + multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(matmul_qkv, matmul_qkv, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_qkv_out, matmul_qkv_out, + multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(reshape2_qkv, reshape2_qkv, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(reshape2_qkv_out, reshape2_qkv_out, + multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(transpose2_qkv, transpose2_qkv, + multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(transpose2_qkv_out, transpose2_qkv_out, + multihead_pattern); + + // If weights or biases in qkv's fc are shared by multiple multihead_matmul + // patterns, we do not support this kind of fusion, this pass will not take + // effect. + bool is_fc_params_shared = + mul0_w->outputs.size() > 1 || mul1_w->outputs.size() > 1 || + mul2_w->outputs.size() > 1 || eltadd0_b->outputs.size() > 1 || + eltadd1_b->outputs.size() > 1 || eltadd2_b->outputs.size() > 1; + if (is_fc_params_shared) { + return; + } + fuse_creater(input0, mul0, mul1, mul2, mul0_out, mul1_out, mul2_out, mul0_w, + mul1_w, mul2_w, eltadd0_b, eltadd1_b, eltadd2_b, eltadd_qk_b, + reshape2_0, reshape2_qkv_out, matmul_qk); + + std::unordered_set marked_nodes({eltadd0, + eltadd1, + eltadd2, + eltadd1_b, + eltadd2_b, + eltadd0_out, + eltadd1_out, + eltadd2_out, + reshape2_0, + reshape2_1, + reshape2_2, + reshape2_0_out, + reshape2_1_out, + reshape2_2_out, + transpose2_0, + transpose2_1, + transpose2_2, + transpose2_0_out, + transpose2_1_out, + transpose2_2_out, + matmul_qk, + matmul_qk_out, + eltadd_qk, + eltadd_qk_out, + softmax_qk, + softmax_qk_out, + transpose2_qkv, + transpose2_qkv_out, + matmul_qkv, + matmul_qkv_out, + mul0, + mul1, + mul2, + mul0_out, + mul1_out, + mul2_out, + mul1_w, + mul2_w, + reshape2_qkv}); + // Remove unneeded nodes. 
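+    // Only input0, mul0_w and eltadd0_b (which now hold the combined weight
+    // and bias), eltadd_qk_b and reshape2_qkv_out survive; the rest of the
+    // matched subgraph is deleted.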
+ GraphSafeRemoveNodes(graph, marked_nodes); + ++fusion_count; + }; + gpd(graph, handler); + + return fusion_count; +} + } // namespace patterns void MultiHeadMatmulFusePass::ApplyImpl(Graph* graph) const { @@ -706,6 +1179,21 @@ void MultiHeadMatmulV2FusePass::ApplyImpl(Graph* graph) const { AddStatis(fusion_count); } +void MultiHeadMatmulV3FusePass::ApplyImpl(Graph* graph) const { + FusePassBase::Init(name_scope_, graph); + auto* scope = param_scope(); + PADDLE_ENFORCE_NOT_NULL( + scope, + platform::errors::Fatal( + "During the multiheadMatmul pass, The scope should not be null.")); + + int fusion_count = patterns::BuildFusionV3(graph, name_scope_, scope); + if (fusion_count > 0) { + graph->Set(kMultiheadMatmulPass, new bool(true)); + } + AddStatis(fusion_count); +} + } // namespace ir } // namespace framework } // namespace paddle @@ -715,6 +1203,8 @@ REGISTER_PASS(multihead_matmul_fuse_pass, REGISTER_PASS(multihead_matmul_fuse_pass_v2, paddle::framework::ir::MultiHeadMatmulV2FusePass); +REGISTER_PASS(multihead_matmul_fuse_pass_v3, + paddle::framework::ir::MultiHeadMatmulV3FusePass); REGISTER_PASS_CAPABILITY(multihead_matmul_fuse_pass_v2) .AddCombination( paddle::framework::compatible::OpVersionComparatorCombination() @@ -725,3 +1215,13 @@ REGISTER_PASS_CAPABILITY(multihead_matmul_fuse_pass_v2) .EQ("scale", 0) .LE("matmul", 1) .EQ("softmax", 0)); +REGISTER_PASS_CAPABILITY(multihead_matmul_fuse_pass_v3) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .LE("elementwise_add", 1) + .EQ("reshape2", 0) + .EQ("transpose2", 0) + .EQ("scale", 0) + .LE("matmul", 1) + .EQ("matmul_v2", 0) + .EQ("softmax", 0)); diff --git a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.h b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.h index f5327dc71080be9edff30855a157465e0b35712a..c7f1336211d3463846a61b998c4f12f11095de32 100644 --- a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.h +++ b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.h @@ -89,9 +89,63 @@ struct MultiHeadMatmulPattern : public PatternBase { PATTERN_DECL_NODE(matmul_qkv); PATTERN_DECL_NODE(matmul_qkv_out); }; + +struct MultiHeadMatmulV3Pattern : public PatternBase { + MultiHeadMatmulV3Pattern(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "multihead_matmul_v3") {} + + PDNode* operator()(); + + // declare operator node's name + PATTERN_DECL_NODE(input0); + PATTERN_DECL_NODE(mul0); + PATTERN_DECL_NODE(mul1); + PATTERN_DECL_NODE(mul2); + PATTERN_DECL_NODE(mul0_w); + PATTERN_DECL_NODE(mul1_w); + PATTERN_DECL_NODE(mul2_w); + PATTERN_DECL_NODE(mul0_out); + PATTERN_DECL_NODE(mul1_out); + PATTERN_DECL_NODE(mul2_out); + PATTERN_DECL_NODE(eltadd0); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(eltadd1); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(eltadd2); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(eltadd0_b); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(eltadd1_b); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(eltadd2_b); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(eltadd0_out); + PATTERN_DECL_NODE(eltadd1_out); + PATTERN_DECL_NODE(eltadd2_out); + PATTERN_DECL_NODE(reshape2_0); + PATTERN_DECL_NODE(reshape2_1); + PATTERN_DECL_NODE(reshape2_2); + PATTERN_DECL_NODE(reshape2_qkv); + PATTERN_DECL_NODE(reshape2_0_out); + PATTERN_DECL_NODE(reshape2_1_out); + PATTERN_DECL_NODE(reshape2_2_out); + PATTERN_DECL_NODE(reshape2_qkv_out); + PATTERN_DECL_NODE(transpose2_0); + PATTERN_DECL_NODE(transpose2_1); + PATTERN_DECL_NODE(transpose2_2); + PATTERN_DECL_NODE(transpose2_qkv); + 
PATTERN_DECL_NODE(transpose2_0_out); + PATTERN_DECL_NODE(transpose2_1_out); + PATTERN_DECL_NODE(transpose2_2_out); + PATTERN_DECL_NODE(transpose2_qkv_out); + PATTERN_DECL_NODE(matmul_qk); + PATTERN_DECL_NODE(matmul_qk_out); + PATTERN_DECL_NODE(eltadd_qk); + PATTERN_DECL_NODE(eltadd_qk_b); + PATTERN_DECL_NODE(eltadd_qk_out); + PATTERN_DECL_NODE(softmax_qk); + PATTERN_DECL_NODE(softmax_qk_out); + + PATTERN_DECL_NODE(matmul_qkv); + PATTERN_DECL_NODE(matmul_qkv_out); +}; + } // namespace patterns -// The MulGRUFusePass and MulGRUFusePass will fuse to the same FusionGRU op. class MultiHeadMatmulFusePass : public FusePassBase { public: virtual ~MultiHeadMatmulFusePass() {} @@ -112,6 +166,16 @@ class MultiHeadMatmulV2FusePass : public FusePassBase { const std::string name_scope_{"multihead_matmul_fuse_v2"}; }; +class MultiHeadMatmulV3FusePass : public FusePassBase { + public: + virtual ~MultiHeadMatmulV3FusePass() {} + + protected: + void ApplyImpl(Graph* graph) const; + + const std::string name_scope_{"multihead_matmul_fuse_v3"}; +}; + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/placement_pass_base.cc b/paddle/fluid/framework/ir/placement_pass_base.cc index fd604ffe7b5de440fb3509a01fd2a1bc1a553574..35ba92006077999a541e700c6884db0d32f0bfab 100644 --- a/paddle/fluid/framework/ir/placement_pass_base.cc +++ b/paddle/fluid/framework/ir/placement_pass_base.cc @@ -77,7 +77,8 @@ bool PlacementPassBase::IsDefaultOpTypes(const std::string& op_type) const { // the corresponding pass. const std::vector not_default_op_types = { "bilinear_interp", "nearest_interp", "trilinear_interp", - "bicubic_interp", "linear_interp"}; + "bicubic_interp", "linear_interp", "bilinear_interp_v2", + "linear_interp_v2"}; bool is_interpolate_op = std::find(not_default_op_types.begin(), not_default_op_types.end(), op_type) != not_default_op_types.end(); diff --git a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc index 5043fce8885cdefc24b52c0321d83b411ccd5db4..2fc39fd25d56c18ac510b550186eccaeb6eb9030 100644 --- a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc +++ b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc @@ -225,10 +225,13 @@ void FuseDequant(ir::Graph* graph, Scope* scope, quantized_op_type == "depthwise_conv2d") { PADDLE_ENFORCE_EQ( dequant_type, "fake_channel_wise_dequantize_max_abs", - platform::errors::InvalidArgument("conv2d op must be dequantized by " - "[fake_channel_wise_dequantize_max_" - "abs], but got %s", - dequant_type)); + platform::errors::InvalidArgument( + "conv2d op must be dequantized by " + "[fake_channel_wise_dequantize_max_abs], but got %s. 
" + "If you uses PaddleSlim to generate the quantized " + "model, please set the 'weight_quantize_type' params as " + "'channel_wise_abs_max' and generate the quantized model again.", + dequant_type)); PADDLE_ENFORCE_EQ( weight_scale.size(), static_cast(w_dims[0]), platform::errors::InvalidArgument( diff --git a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc index 479df876fbe007119c55261dd149bd515b0cd117..bf59c140005167e3be342b4039d2b13e5bddf1c6 100644 --- a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc @@ -54,6 +54,17 @@ static bool IsFCWithAct(Node* n, const std::string& act_type = "relu") { return false; } +static bool IsFCWithPaddingWeights(Node* n) { + bool res = false; + if (n && n->IsOp() && n->Op() && n->Op()->Type() == "fc" && + n->inputs.size() == 3U && n->outputs.size() == 1U) { + if (n->Op()->HasAttr("padding_weights")) { + res = BOOST_GET_CONST(bool, n->Op()->GetAttr("padding_weights")); + } + } + return res; +} + static bool IsParamOfFC(Node* n, const std::string& param_name) { if (IsInputOfFC(n) && n->inputs.empty() && (n->Name() == n->outputs[0]->Op()->Input(param_name)[0])) { @@ -255,7 +266,7 @@ void BuildRepeatedFCReluPattern(PDPattern* pattern, fc_ops[i] = pattern->NewNode( [=](Node* x) { - if (!IsFCWithAct(x, "relu")) { + if (!IsFCWithAct(x, "relu") || IsFCWithPaddingWeights(x)) { return false; } auto* fc_out_var = x->outputs[0]; diff --git a/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc index ada20113077c18080e358849073c3703e881d262..232e1d8da4ded39df732912bc86edb9a1fb54317 100644 --- a/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc @@ -141,14 +141,6 @@ void SkipLayerNormFusePass::ApplyImpl(ir::Graph *graph) const { GET_IR_NODE_FROM_SUBGRAPH(layer_norm_variance, layer_norm_variance, fused_pattern); - // check if is in ernie or not - if (!graph->Has(kEmbEltwiseLayernormPass) || - !graph->Has(kMultiheadMatmulPass)) { - LOG(INFO) << "The skip_layernorm_fuse_pass is only supported in " - << "Ernie/Bert model. 
Just skip this pass."; - return; - } - std::unordered_set del_node_set; // Create an SkipLayerNorm op node @@ -161,6 +153,10 @@ void SkipLayerNormFusePass::ApplyImpl(ir::Graph *graph) const { new_desc.SetInput("Scale", {layer_norm_scale->Name()}); new_desc.SetInput("Bias", {layer_norm_bias->Name()}); + if (elementwise->Op()->HasAttr("out_threshold")) { + new_desc.SetAttr("enable_int8", true); + } + // outputs new_desc.SetOutput("Out", {layer_norm_out->Name()}); diff --git a/paddle/fluid/framework/library_type.h b/paddle/fluid/framework/library_type.h index 4307e51862df572e013431fceaaf89cc1cf6679c..8fe314cf5f18c5e8cc0a56ca8f231d32b9896aaf 100644 --- a/paddle/fluid/framework/library_type.h +++ b/paddle/fluid/framework/library_type.h @@ -61,6 +61,8 @@ inline LibraryType StringToLibraryType(const char* ctype) { return LibraryType::kPlain; } else if (s == std::string("XPU")) { return LibraryType::kPlain; + } else if (s == std::string("NPU")) { + return LibraryType::kPlain; } else if (s == std::string("CUDA")) { return LibraryType::kPlain; } else { diff --git a/paddle/fluid/framework/load_op_lib.h b/paddle/fluid/framework/load_op_lib.h deleted file mode 100644 index 16cffe119d63e0cb8bd6ff76f4ac5792127f480d..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/load_op_lib.h +++ /dev/null @@ -1,120 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/op_desc.h" -#include "paddle/fluid/platform/dynload/dynamic_loader.h" -#include "paddle/fluid/platform/port.h" - -namespace paddle { -namespace framework { - -template -T *DynLoad(void *handle, std::string name) { - T *func = reinterpret_cast(dlsym(handle, name.c_str())); -#if !defined(_WIN32) - auto errorno = dlerror(); -#else - auto errorno = GetLastError(); -#endif // !_WIN32 - PADDLE_ENFORCE_NOT_NULL( - func, - platform::errors::NotFound( - "Failed to load dynamic operator library, error code(%s).", errorno)); - return func; -} - -void LoadOpLib(const std::string &dso_name) { - void *handle = paddle::platform::dynload::GetOpDsoHandle(dso_name); - - typedef OpInfoMap &get_op_info_t(); - get_op_info_t *get_op_info = - DynLoad(handle, "PD_GetOpInfoMap"); - auto &op_info = get_op_info(); - auto *dyn_info_map = op_info.mutable_map(); - - typedef std::vector grad_op_desc_maker_t( - const OpDesc &, const std::unordered_set &, - std::unordered_map *, - const std::vector &); - - grad_op_desc_maker_t *grad_op_desc_maker = - DynLoad(handle, "PD_GetGradOpDescStrs"); - - auto &info_map = OpInfoMap::Instance(); - for (const auto &n : *(dyn_info_map)) { - auto type = n.first; - if (type == "recurrent" || type == "recurrent_grad" || - type == "conditional_block" || type == "conditional_block_grad") { - continue; - } - PADDLE_ENFORCE_NE(info_map.Has(n.first), true, - platform::errors::AlreadyExists( - "Operator (%s) has been registered.", type)); - OpInfo info; - info.creator_ = n.second.creator_; - - // If get the protocol buffer from dynamic library directly, there - // will be deconstruction error - // ** Error in `python`: free(): invalid pointer: - // ... paddle::framework::proto::OpDesc::SharedDtor() - // It seems a bug in protobuf, see - // https://github.com/protocolbuffers/protobuf/issues/435 - // So, get the serialized binary string from dynamic library, - // then deserialize to protocol buffer. 
- info.grad_op_maker_ = [grad_op_desc_maker]( - const OpDesc &op_desc, - const std::unordered_set &no_grad_set, - std::unordered_map *grad_to_var, - const std::vector &grad_block) { - std::vector strs = - grad_op_desc_maker(op_desc, no_grad_set, grad_to_var, grad_block); - std::vector> ret; - for (auto &str : strs) { - proto::OpDesc proto_desc; - PADDLE_ENFORCE_EQ(proto_desc.ParseFromString(str), true, - platform::errors::InvalidArgument( - "Failed to parse OpDesc from string.")); - ret.emplace_back(new OpDesc(proto_desc, nullptr)); - } - return ret; - }; - info.proto_ = n.second.proto_; - info.checker_ = n.second.checker_; - info.infer_var_type_ = n.second.infer_var_type_; - info.infer_shape_ = n.second.infer_shape_; - info.infer_inplace_ = n.second.infer_inplace_; - info.infer_no_need_buffer_vars_ = n.second.infer_no_need_buffer_vars_; - info.use_default_grad_op_desc_maker_ = - n.second.use_default_grad_op_desc_maker_; - info.use_empty_grad_op_desc_maker_ = n.second.use_empty_grad_op_desc_maker_; - - info_map.Insert(type, info); - } - - typedef void init_device_t(platform::DeviceContextPool *); - init_device_t *init_dev = - DynLoad(handle, "PD_InitDevicesPool"); - init_dev(&(platform::DeviceContextPool::Instance())); -} - -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc index 3a79452e230ef4f340fbc5464236063193d9b28f..0a6b5e44452fe191fce5fea058194a92e3a406de 100644 --- a/paddle/fluid/framework/lod_tensor.cc +++ b/paddle/fluid/framework/lod_tensor.cc @@ -268,6 +268,21 @@ void SerializeToStream(std::ostream &os, const LoDTensor &tensor, TensorToStream(os, static_cast(tensor), dev_ctx); } +void SerializeToStream(std::ostream &os, const LoDTensor &tensor) { + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + const platform::DeviceContext *dev_ctx; + auto place = tensor.place(); + dev_ctx = pool.Get(place); + SerializeToStream(os, tensor, *dev_ctx); +} + +void DeserializeFromStream(std::ifstream &os, LoDTensor *tensor) { + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + const platform::DeviceContext *dev_ctx; + dev_ctx = pool.Get(platform::CPUPlace()); + DeserializeFromStream(os, tensor, *dev_ctx); +} + void DeserializeFromStream(std::istream &is, LoDTensor *tensor, const platform::DeviceContext &dev_ctx, const size_t &seek, diff --git a/paddle/fluid/framework/lod_tensor.h b/paddle/fluid/framework/lod_tensor.h index b8911154e6bf7b3b3fbd5946e21401dba4002929..6b357aba1c5f9a4c0db53b20a9d47e64b71d0a11 100644 --- a/paddle/fluid/framework/lod_tensor.h +++ b/paddle/fluid/framework/lod_tensor.h @@ -14,16 +14,11 @@ limitations under the License. 
*/ #pragma once +#include #include #include #include #include -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#include -#include -#endif - -#include #include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/mixed_vector.h" @@ -260,5 +255,9 @@ LoD ConvertToLengthBasedLoD(const LoD& offset_lod); LoD ConvertToOffsetBasedLoD(const LoD& length_lod); +void SerializeToStream(std::ostream& os, const LoDTensor& tensor); + +void DeserializeFromStream(std::ifstream& os, LoDTensor* tensor); + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/multi_trainer.cc b/paddle/fluid/framework/multi_trainer.cc index a9e15ee1758070794e7d933d2e9c093186ac60a5..e347f4ab387b82029a304e0aa2ea59a6c446f189 100644 --- a/paddle/fluid/framework/multi_trainer.cc +++ b/paddle/fluid/framework/multi_trainer.cc @@ -38,6 +38,13 @@ void MultiTrainer::Initialize(const TrainerDesc& trainer_desc, need_merge_var_names_.push_back( trainer_desc.downpour_param().stat_var_names(i)); } +#ifdef PADDLE_WITH_HETERPS + for (int i = 0; i < thread_num_; ++i) { + int num = trainer_desc.worker_places(i); + platform::CUDAPlace place = platform::CUDAPlace(num); + places_.push_back(place); + } +#endif // get filelist from trainer_desc here const std::vector readers = dataset->GetReaders(); @@ -102,13 +109,42 @@ void MultiTrainer::InitDumpEnv() { void MultiTrainer::InitTrainerEnv(const ProgramDesc& main_program, const platform::Place& place) { for (int i = 0; i < thread_num_; ++i) { +#ifdef PADDLE_WITH_HETERPS + workers_[i]->SetPlace(places_[i]); + workers_[i]->SetReaderPlace(places_[i]); +#else workers_[i]->SetPlace(place); workers_[i]->SetReaderPlace(place); +#endif workers_[i]->SetRootScope(root_scope_); workers_[i]->CreateDeviceResource(main_program); // Program workers_[i]->BindingDataFeedMemory(); workers_[i]->CacheProgram(main_program); } +#ifdef PADDLE_WITH_HETERPS + for (int num = 0; num < thread_num_; ++num) { + auto place = places_[num]; + Scope* scope = workers_[num]->GetThreadScope(); + auto& block = main_program.Block(0); + for (auto& var : block.AllVars()) { + if (var->Persistable()) { + auto name = var->Name(); + Variable* root_var = root_scope_->FindVar(name); + if (!root_var) { + continue; + } + if (root_var->IsType()) { + continue; + } + LoDTensor* root_tensor = root_var->GetMutable(); + auto* ptr = scope->Var(name); + InitializeVariable(ptr, proto::VarType::LOD_TENSOR); + LoDTensor* thread_tensor = ptr->GetMutable(); + TensorCopy(*root_tensor, place, thread_tensor); + } + } + } +#endif } void MultiTrainer::InitOtherEnv(const ProgramDesc& main_program) { @@ -139,10 +175,79 @@ void MultiTrainer::Run() { } } +#ifdef PADDLE_WITH_HETERPS +void MultiTrainer::MergeDenseParam() { +#ifdef PADDLE_WTIH_PSCORE + auto communicator = paddle::distributed::Communicator::GetInstance(); + auto& recv_ctx = communicator->GetRecvCtxMap(); + Scope* thread_scope = workers_[0]->GetThreadScope(); + for (auto& iter : recv_ctx) { + auto& varnames = iter.second; + for (auto& name : varnames) { + Variable* root_var = root_scope_->FindVar(name); + LoDTensor* root_tensor = root_var->GetMutable(); + Variable* var = thread_scope->FindVar(name); + LoDTensor* tensor = var->GetMutable(); + TensorCopy((*tensor), root_tensor->place(), root_tensor); + } + } +#endif +} +#endif + +template +void MultiTrainer::MergeToRootScope(LoDTensor* root_tensor, LoDTensor* tensor) { + LoDTensor tmp_root; + TensorCopy(*root_tensor, platform::CPUPlace(), &tmp_root); + T* tmp_root_data = tmp_root.data(); + LoDTensor tmp_tensor; 
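+  // Bring the worker-side tensor to CPU as well, accumulate it element-wise into the CPU copy of the root tensor, then copy the summed result back into root_tensor.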
+ TensorCopy(*tensor, platform::CPUPlace(), &tmp_tensor); + T* data = tmp_tensor.data(); + for (int i = 0; i < tmp_tensor.numel(); i++) { + tmp_root_data[i] += data[i]; + } + TensorCopy(tmp_root, platform::CPUPlace(), root_tensor); +} + void MultiTrainer::Finalize() { if (need_dump_field_ || need_dump_param_) { FinalizeDumpEnv(); } +#ifdef PADDLE_WITH_HETERPS + for (size_t i = 0; i < need_merge_var_names_.size(); i++) { + Variable* root_var = root_scope_->FindVar(need_merge_var_names_[i]); + if (root_var == nullptr) { + continue; + } + LoDTensor* root_tensor = root_var->GetMutable(); + + for (size_t j = 0; j < places_.size(); j++) { + Scope* cur_thread_scope = workers_[j]->GetThreadScope(); + Variable* thread_var = + cur_thread_scope->FindVar(need_merge_var_names_[i]); + if (thread_var == nullptr) { + continue; + } + LoDTensor* thread_tensor = thread_var->GetMutable(); +#define MergeCallback(cpp_type, proto_type) \ + do { \ + if (root_tensor->type() == proto_type) { \ + if (thread_tensor->type() != proto_type) { \ + VLOG(0) << "Error: thread id=" << j << ", need_merge_var_names_[" << i \ + << "] " << need_merge_var_names_[i] \ + << ", root tensor type=" << root_tensor->type() \ + << ", thread tensor type=" << thread_tensor->type(); \ + exit(-1); \ + } \ + MergeToRootScope(root_tensor, thread_tensor); \ + } \ + } while (0) + _ForEachDataType_(MergeCallback); + } + } + MergeDenseParam(); + +#endif root_scope_->DropKids(); } diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc index f107321958ba7be4d3ba31bd128f0cbbad694b85..7d55d8c41e3e92349dc9986b3d236db2ebdac01b 100644 --- a/paddle/fluid/framework/naive_executor.cc +++ b/paddle/fluid/framework/naive_executor.cc @@ -128,7 +128,7 @@ NaiveExecutor::~NaiveExecutor() { #ifdef PADDLE_WITH_MKLDNN // Clear mkl-dnn cache, // this is needed to have mkl-dnn unit tests working - ClearMKLDNNCache(place_); + ClearMKLDNNCache(place_, this); #endif } diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc index 7af5c54ceed74fc334dd10438cc5b0b62c06042d..519bf8c633a013fedab4f529dad014a71ad2d594 100644 --- a/paddle/fluid/framework/op_desc.cc +++ b/paddle/fluid/framework/op_desc.cc @@ -447,6 +447,11 @@ void OpDesc::SetOutput(const std::string ¶m_name, this->outputs_[param_name] = args; } +void OpDesc::RemoveOutput(const std::string &name) { + outputs_.erase(name); + need_update_ = true; +} + bool OpDesc::HasProtoAttr(const std::string &name) const { auto &op_info = OpInfoMap::Instance(); if (op_info.Has(desc_.type())) { diff --git a/paddle/fluid/framework/op_desc.h b/paddle/fluid/framework/op_desc.h index 95c33bca6c7f1df6ad71a3b4c2f82d726cafb5fc..1bc1a308e453bb816b00d6ca9a62358f8d33082a 100644 --- a/paddle/fluid/framework/op_desc.h +++ b/paddle/fluid/framework/op_desc.h @@ -65,6 +65,7 @@ class OpDesc { void SetOutput(const std::string ¶m_name, const std::vector &args); + void RemoveOutput(const std::string &name); bool HasAttr(const std::string &name) const { return attrs_.find(name) != attrs_.end(); diff --git a/paddle/fluid/framework/op_info.h b/paddle/fluid/framework/op_info.h index af657232e91a68aa26ab85faf63acdd1b8f191d1..ddd84bfd81abf53de3ad534cac44374dde631195 100644 --- a/paddle/fluid/framework/op_info.h +++ b/paddle/fluid/framework/op_info.h @@ -20,7 +20,6 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/attribute.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/no_need_buffer_vars_inference.h" #include "paddle/fluid/framework/type_defs.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/framework/op_proto_maker.h b/paddle/fluid/framework/op_proto_maker.h index 912e82f60ef5df81d163e1d8b937eb026e51478a..506c3eb1e0ad09658e431f03ac8231b4c796ec83 100644 --- a/paddle/fluid/framework/op_proto_maker.h +++ b/paddle/fluid/framework/op_proto_maker.h @@ -16,7 +16,6 @@ limitations under the License. */ #include #include "glog/logging.h" #include "paddle/fluid/framework/attribute.h" -#include "paddle/fluid/framework/framework.pb.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h index 472c6f408266af6b47b7fdad2d1c9b3be6ee8cf5..593d4d839fa910d2ef81b3ae7483cee4399926cb 100644 --- a/paddle/fluid/framework/op_registry.h +++ b/paddle/fluid/framework/op_registry.h @@ -25,7 +25,8 @@ limitations under the License. */ #include #define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h -#include "glog/logging.h" // For VLOG() +#include "gflags/gflags.h" +#include "glog/logging.h" // For VLOG() #include "paddle/fluid/framework/attribute.h" #include "paddle/fluid/framework/details/op_registry.h" #include "paddle/fluid/framework/grad_op_desc_maker.h" @@ -67,6 +68,8 @@ class Version; } // namespace framework } // namespace paddle +DECLARE_bool(check_kernel_launch); + namespace paddle { namespace framework { @@ -134,6 +137,19 @@ class OpRegistry { static std::unique_ptr CreateOp(const OpDesc& op_desc); }; +template +inline void CheckKernelLaunch(const char* op_type) {} + +#ifdef PADDLE_WITH_CUDA +template <> +inline void CheckKernelLaunch<::paddle::platform::CUDAPlace>( + const char* op_type) { + if (FLAGS_check_kernel_launch) { + PADDLE_ENFORCE_CUDA_LAUNCH_SUCCESS(op_type); + } +} +#endif + template struct OpKernelRegistrarFunctor; @@ -162,8 +178,9 @@ struct OpKernelRegistrarFunctor { RegisterKernelClass( op_type, library_type, customized_type_value, - [](const framework::ExecutionContext& ctx) { + [op_type](const framework::ExecutionContext& ctx) { KERNEL_TYPE().Compute(ctx); + CheckKernelLaunch(op_type); }); constexpr auto size = std::tuple_size>::value; OpKernelRegistrarFunctor @@ -223,8 +240,13 @@ struct OpKernelRegistrarFunctorEx(op_type, library_type, - customized_type_value, Functor()); + RegisterKernelClass( + op_type, library_type, customized_type_value, + + [op_type](const framework::ExecutionContext& ctx) { + Functor()(ctx); + CheckKernelLaunch(op_type); + }); constexpr auto size = std::tuple_size>::value; @@ -304,6 +326,9 @@ struct OpKernelRegistrarFunctorEx #include -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/op_version_proto.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 833a28a7579ca09279147c346abf5fffdc7f3324..955c917b2c1bf4119e13b5d8cdb813b036fbf587 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -47,9 +47,6 @@ DECLARE_bool(benchmark); DECLARE_bool(check_nan_inf); DECLARE_bool(enable_unused_var_check); DEFINE_int32(inner_op_parallelism, 0, "number of threads for inner op"); -DEFINE_bool(fast_check_nan_inf, false, - "Fast checking NAN/INF after each operation. 
It will be a little" - "bit slow, much faster than check_nan_inf"); namespace paddle { namespace framework { @@ -211,6 +208,16 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { #else auto dev_id = BOOST_GET_CONST(platform::XPUPlace, place).device; platform::SetXPUDeviceId(dev_id); +#endif + } else if (platform::is_npu_place(place)) { +#ifndef PADDLE_WITH_ASCEND_CL + PADDLE_THROW(platform::errors::Unavailable( + "Cannot run operator on place %s, please recompile paddle or " + "reinstall Paddle with NPU support.", + place)); +#else + auto dev_id = BOOST_GET_CONST(platform::NPUPlace, place).device; + platform::SetNPUDeviceId(dev_id); #endif } @@ -1173,25 +1180,6 @@ void OperatorWithKernel::RunImpl(const Scope& scope, #endif } - if (FLAGS_fast_check_nan_inf) { - for (auto& vname : OutputVars(true)) { - // only check inserted vars, - // please see executor.py for details of fast_check_nan_inf - if (vname.rfind("debug_var") == 0) { - VLOG(3) << "debugging nan/inf in var " << vname; - - auto* var = exec_scope.FindVar(vname); - if (var == nullptr) continue; - if (var->IsType()) { - CheckTensorNANOrInf(type_, vname, var->Get()); - } else if (var->IsType()) { - CheckTensorNANOrInf(type_, vname, - var->Get().value()); - } - } - } - } - if (FLAGS_check_nan_inf) { framework::details::CheckOpHasNanOrInf(*this, exec_scope, place); } @@ -1248,7 +1236,8 @@ void OperatorWithKernel::ChooseKernel(const RuntimeContext& ctx, } } } - VLOG(3) << "expected_kernel_key:" << expected_kernel_key; + VLOG(3) << "op type:" << type_ + << ", expected_kernel_key:" << expected_kernel_key; auto kernel_iter = kernels.find(expected_kernel_key); #ifdef PADDLE_WITH_MKLDNN @@ -1270,6 +1259,16 @@ void OperatorWithKernel::ChooseKernel(const RuntimeContext& ctx, expected_kernel_key.place_ = platform::CPUPlace(); kernel_iter = kernels.find(expected_kernel_key); } +#endif +#ifdef PADDLE_WITH_ASCEND_CL + if (kernel_iter == kernels.end() && + is_npu_place(expected_kernel_key.place_)) { + VLOG(3) << "missing NPU kernel: " << type_ + << ", expected_kernel_key:" << expected_kernel_key + << ", fallbacking to CPU one!"; + expected_kernel_key.place_ = platform::CPUPlace(); + kernel_iter = kernels.find(expected_kernel_key); + } #endif PADDLE_ENFORCE_NE(kernel_iter, kernels.end(), platform::errors::NotFound( diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index e9ecf9b5a8397880fb878babfe329701023984b1..3fc61581eca720f64d4b19fd70b9b619cea9fcef 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -27,7 +27,6 @@ limitations under the License. */ #include "glog/logging.h" // For VLOG #include "paddle/fluid/framework/attribute.h" #include "paddle/fluid/framework/block_desc.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/op_kernel_type.h" @@ -420,6 +419,7 @@ class ExecutionContext { const RuntimeContext Context() const { return ctx_; } std::string DebugString() const { return op_.DebugString(); } + const OperatorBase& GetOp() const { return op_; } private: const OperatorBase& op_; diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 2f280d5cc4ae0b3056df553b19f47979c835f71e..eb021609e825839825b657ef516a18c5b4cbcc74 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -27,7 +27,6 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h" -#include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h" #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_helper.h" @@ -625,144 +624,21 @@ ParallelExecutor::ParallelExecutor(const std::vector &places, const BuildStrategy &build_strategy, ir::Graph *graph) : member_(new ParallelExecutorPrivate(places, scope)) { + PADDLE_ENFORCE(places.size() > 0 && !is_npu_place(places[0]), + platform::errors::Unavailable( + "NPU is not supported in ParallelExecutor")); InitP2P(places); ir::InitReaderQueueDeviceCount(graph, *(member_->global_scope_), member_->places_.size()); - member_->use_device_ = exec_strategy.use_device_; - member_->build_strategy_ = build_strategy; - member_->use_all_reduce_ = member_->build_strategy_.reduce_ == - BuildStrategy::ReduceStrategy::kAllReduce; - member_->nranks_ = build_strategy.num_trainers_ * places.size(); - if (!member_->use_all_reduce_ && member_->nranks_ == 1) { - LOG(INFO) << "If you set build_strategy.reduce with 'Reduce'," - "the number of places should be greater than 1."; - member_->build_strategy_.reduce_ = - BuildStrategy::ReduceStrategy::kAllReduce; - member_->use_all_reduce_ = true; - } -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && defined(_WIN32) - if (member_->IsUseCUDA(member_->use_device_)) { - PADDLE_ENFORCE_EQ( - places.size(), 1, - platform::errors::Unavailable("Windows can support Single GPU only.")); - } -#endif - -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \ - (!defined(PADDLE_WITH_NCCL) && !defined(PADDLE_WITH_RCCL)) - if (member_->IsUseCUDA(member_->use_device_)) { - PADDLE_ENFORCE_EQ( - places.size(), 1, - platform::errors::PermissionDenied( - "Your machine has multiple cards, " - "but the WITH_NCCL option is not turned on during compilation, " - "and you cannot use multi-card training or prediction. " - "Please recompile and turn on the WITH_NCCL option.")); - } -#endif - - std::string device_name; - if (member_->use_device_ == p::kCPU) { - device_name = "CPU"; - } else if (member_->use_device_ == p::kCUDA) { - device_name = "CUDA"; - } else { - device_name = "XPU"; - } - - VLOG(1) << string::Sprintf( - "The Program will be executed on %s using ParallelExecutor, %lu " - "cards are used, so %lu programs are executed in parallel.", - device_name, places.size(), places.size()); - - // Step 1. Bcast the bcast_vars to devs. 
- // Create local scopes - if (local_scopes.empty()) { - member_->own_local_scope_ = true; - member_->local_scopes_.emplace_back(member_->global_scope_); - for (size_t i = 1; i < member_->places_.size(); ++i) { - member_->local_scopes_.emplace_back(&scope->NewScope()); - } - } else { - member_->own_local_scope_ = false; - PADDLE_ENFORCE_EQ(member_->places_.size(), local_scopes.size(), - platform::errors::PreconditionNotMet( - "member_->places_.size() = %d is not equal to " - "local_scopes.size() = %d", - member_->places_.size(), local_scopes.size())); - for (size_t i = 0; i < member_->places_.size(); ++i) { - member_->local_scopes_.emplace_back(&local_scopes[i]->NewScope()); - } - } - - std::vector graphs; - if (member_->build_strategy_.async_mode_) { - PADDLE_ENFORCE_EQ(member_->IsUseCUDA(member_->use_device_), false, - platform::errors::Unavailable( - "gpu mode does not support async_mode_ now!")); - graphs.push_back(graph); - for (size_t i = 1; i < places.size(); ++i) { - auto *tmp_graph = new ir::Graph(graph->OriginProgram()); - async_graphs_.emplace_back(tmp_graph); - graphs.push_back(tmp_graph); - } - } - - // FIXME(Yancey1989): parallel graph mode get better performance - // in GPU allreduce distributed training. Need an elegant way to - // choice the execution strategy. - member_->build_strategy_.enable_parallel_graph_ = - EnableParallelGraphExecution(*graph, exec_strategy, - member_->build_strategy_); - if (member_->build_strategy_.enable_parallel_graph_) { - LOG(INFO) << "The Executor would execute the graph by ParallelGraph " - "Execution which can get better performance," - << "you can force it off by env FLAGS_enable_parallel_graph=0"; - } + // Initialize necessary info of member_ with strategy. + InitExecutorPrivateMemberInfo(exec_strategy, build_strategy, places.size(), + *graph); - if (member_->IsUseCUDA(member_->use_device_) && member_->nranks_ > 1) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - member_->InitOrGetNCCLCommunicator(scope, &member_->build_strategy_); - - // Initialize device context's nccl comm, will be used by normal - // Operators like sync_batch_norm, and collective ops. - // NOTE: more than one ParallelExecutor with same place, the nccl comm will - // be rewrite and there will be some problem. - // NOTE: NCCL group-calls and non-group-calls can not use the same - // NCCL communicator, so for ParallelGraph and Multi-Process mode, re-use - // same communicators. - auto *nccl_ctxs = - member_->nccl_ctxs_->GetSyncBatchNormCtx(scope, member_->places_); - auto &pool = platform::DeviceContextPool::Instance(); - for (size_t dev_id = 0; dev_id < member_->places_.size(); ++dev_id) { - auto *dev_ctx = static_cast( - pool.Get(member_->places_[dev_id])); - auto &nccl_ctx = nccl_ctxs->at(member_->places_[dev_id]); - dev_ctx->set_nccl_comm(nccl_ctx.comm()); - } -#else - PADDLE_THROW( - platform::errors::PreconditionNotMet("Not compiled with CUDA.")); -#endif - } - if (member_->use_device_ == p::kXPU && member_->nranks_ > 1) { -#if defined(PADDLE_WITH_XPU_BKCL) - member_->InitOrGetBKCLCommunicator(scope, member_->build_strategy_); + // Step 1. 
Create local scopes and Clone graph into multi device + CreateLocalScopes(scope, local_scopes, /*create_new*/ true); + std::vector graphs = CloneGraphToMultiDevices(graph); + PrepareNCCLCommunicator(scope); - auto *bkcl_ctxs = - member_->bkcl_ctxs_->GetSyncBatchNormCtx(scope, member_->places_); - auto &pool = platform::DeviceContextPool::Instance(); - for (size_t dev_id = 0; dev_id < member_->places_.size(); ++dev_id) { - auto *dev_ctx = static_cast( - pool.Get(member_->places_[dev_id])); - auto &bkcl_ctx = bkcl_ctxs->at(member_->places_[dev_id]); - dev_ctx->set_bkcl_context(bkcl_ctx.comm()); - } -#else - PADDLE_THROW( - platform::errors::PreconditionNotMet("Not compiled with XPU.")); -#endif - } // broadcast parameters from the 0th device to others: auto need_broadcast = [&]() -> bool { if (member_->build_strategy_.num_trainers_ > 1) { @@ -775,257 +651,75 @@ ParallelExecutor::ParallelExecutor(const std::vector &places, } return false; }; - // Bcast Parameters to all GPUs if (need_broadcast()) { BCastParamsToDevices(bcast_vars, member_->build_strategy_.trainer_id_); } - // Startup Program has been run. All local scopes has correct parameters. - // Step 2. Convert main_program to SSA form and dependency graph. Also, insert // ncclOp - std::vector async_graphs(places.size()); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - if (member_->build_strategy_.async_mode_) { - VLOG(3) << "use local async mode"; - graph = member_->build_strategy_.Apply( - graph, {member_->places_[0]}, loss_var_name, - {member_->local_scopes_[0]}, 1, member_->use_device_, - member_->nccl_ctxs_); - for (size_t i = 1; i < member_->places_.size(); ++i) { - graphs[i] = member_->build_strategy_.Apply( - graphs[i], {member_->places_[i]}, loss_var_name, - {member_->local_scopes_[i]}, 1, member_->use_device_, - member_->nccl_ctxs_); - async_graphs[i] = graphs[i]; - } - } else { - graph = member_->build_strategy_.Apply( - graph, member_->places_, loss_var_name, member_->local_scopes_, - member_->nranks_, member_->use_device_, member_->nccl_ctxs_); - } -#elif defined(PADDLE_WITH_XPU_BKCL) - if (member_->build_strategy_.async_mode_) { - VLOG(3) << "use local async mode"; - graph = member_->build_strategy_.Apply( - graph, {member_->places_[0]}, loss_var_name, - {member_->local_scopes_[0]}, 1, member_->use_device_, - member_->bkcl_ctxs_); - for (size_t i = 1; i < member_->places_.size(); ++i) { - graphs[i] = member_->build_strategy_.Apply( - graphs[i], {member_->places_[i]}, loss_var_name, - {member_->local_scopes_[i]}, 1, member_->use_device_, - member_->bkcl_ctxs_); - async_graphs[i] = graphs[i]; - } - } else { - graph = member_->build_strategy_.Apply( - graph, member_->places_, loss_var_name, member_->local_scopes_, - member_->nranks_, member_->use_device_, member_->bkcl_ctxs_); - } -#else - if (member_->build_strategy_.async_mode_) { - VLOG(3) << "use local async mode"; - graph = member_->build_strategy_.Apply( - graph, {member_->places_[0]}, loss_var_name, - {member_->local_scopes_[0]}, 1, member_->use_device_); - for (size_t i = 1; i < member_->places_.size(); ++i) { - graphs[i] = member_->build_strategy_.Apply( - graphs[i], {member_->places_[i]}, loss_var_name, - {member_->local_scopes_[i]}, 1, member_->use_device_); - async_graphs[i] = graphs[i]; - } - } else { - graph = member_->build_strategy_.Apply( - graph, member_->places_, loss_var_name, member_->local_scopes_, - member_->nranks_, member_->use_device_); - } -#endif - + std::vector async_graphs = + CompileGraphWithBuildStrategy(graph, &graphs, 
loss_var_name); graph = member_->ApplyMemoryOptimizePass(graph); - async_graphs[0] = graph; // Step 3. Create vars in each scope. Passes may also create new vars. // skip control vars and empty vars std::vector var_infos; - for (auto &node : graph->Nodes()) { - if (node->IsVar() && !node->IsCtrlVar() && node->Var()) { - var_infos.emplace_back(); - var_infos.back().name_ = node->Var()->Name(); - var_infos.back().type_ = node->Var()->GetType(); - var_infos.back().persistable_ = node->Var()->Persistable(); - - member_->is_persistable_.emplace(node->Var()->Name(), - node->Var()->Persistable()); - } - } - - if (graph->Has(details::kFusedVars)) { - auto &fused_vars = graph->Get(details::kFusedVars); - for (auto &fused_var : fused_vars) { - var_infos.emplace_back(); - var_infos.back() = fused_var.second; + CreateVariableInfos(&var_infos, graph); + std::unordered_map scope_map = + CreateLocalExecScopes(member_->local_scopes_, /*create_new*/ true); - member_->is_persistable_.emplace(fused_var.first, - fused_var.second.persistable_); - } - } + // Step 4. Create SSAGraph executor + std::vector final_graphs = + CreateSSAGraphExecutor(exec_strategy, &async_graphs, graph); - std::unordered_map scope_map; - for (auto *scope : member_->local_scopes_) { - auto &local_exec_scope = scope->NewScope(); - member_->local_exec_scopes_.emplace_back(&local_exec_scope); - scope_map.emplace(scope, &local_exec_scope); + VLOG(3) << "use ScopeBufferedSSAGraphExecutor"; + if (!member_->build_strategy_.async_mode_) { + member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor( + exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, + std::move(var_infos), member_->places_, std::move(member_->executor_))); } - PADDLE_ENFORCE_EQ( - member_->local_scopes_.size(), member_->local_exec_scopes_.size(), - platform::errors::PreconditionNotMet( - "member_->local_scopes_.size() = %d is not equal to " - "member_->local_exec_scopes_.size() = %d", - member_->local_scopes_.size(), member_->local_exec_scopes_.size())); + ResetOpHandleScopeMapOfGraphs(final_graphs, scope_map); + SetReaderOpDeviceInfoOfGraphs(final_graphs); +} - std::vector final_graphs; +void ParallelExecutor::BCastParamsToDevices( + const std::vector &vars, int trainer_id) const { + VLOG(3) << "BCastParamsToDevices"; + // the initializing bcast, all vars would be bcast from device(0). + for (auto &var : vars) { + framework::Variable *main_var = member_->local_scopes_[0]->FindVar(var); + if (main_var == nullptr || !main_var->IsType()) { + continue; + } - if (member_->build_strategy_.async_mode_) { - VLOG(3) << "use AsyncSSAGraphExecutor"; - member_->executor_.reset(new details::AsyncSSAGraphExecutor( - exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, - member_->places_, async_graphs)); - final_graphs = async_graphs; - } else if (member_->build_strategy_.enable_parallel_graph_) { - VLOG(3) << "use ParallelSSAGraphExecutor"; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - // TODO(Yancey1989): Remove passing in the main_program when - // allreduce_seq_pass doesn't need it as the attr. 
- bool is_inference = details::IsDataParallelInferenceGraph(*graph); - bool has_drop_last_read_op = details::HasDropLastReadOp(*graph); + auto &main_tensor = main_var->Get(); + if (!main_tensor.IsInitialized()) { + VLOG(3) << "one in var not inited, return!"; + continue; + } + auto &dims = main_tensor.dims(); + if (paddle::platform::is_gpu_place(main_tensor.place())) { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + std::vector buffers; + buffers.reserve(member_->places_.size()); + size_t numel = main_tensor.numel(); + ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type()); + for (size_t i = 0; i < member_->places_.size(); ++i) { + auto place = member_->places_[i]; + void *buffer; - auto *pg_exe = new details::ParallelSSAGraphExecutor( - exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, - member_->places_, graph); - final_graphs = pg_exe->Graphs(); - member_->executor_.reset(pg_exe); - - if (is_inference && member_->places_.size() > 1) { - member_->inference_executor_ = pg_exe; - if (!has_drop_last_read_op) { - VLOG(5) << "Enable partial feed support in inference phase"; - pg_exe->EnablePartialFeedSupport(); - } - } -#else - PADDLE_THROW(platform::errors::PreconditionNotMet( - "Paddle should be compiled with CUDA for ParallelGraph Execution.")); -#endif - } else { - bool has_drop_last_read_op = details::HasDropLastReadOp(*graph); - auto possible_inference_graphs = - details::TrySeparateToMultipleSingleDeviceGraphs(graph); - if (!possible_inference_graphs.empty()) { - VLOG(5) << "Use ParallelSSAGraphExecutor in inference phase"; - auto *pg_exe = new details::ParallelSSAGraphExecutor( - exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, - member_->places_, std::move(possible_inference_graphs)); - if (!has_drop_last_read_op) { - VLOG(5) << "Enable partial feed support in inference phase"; - pg_exe->EnablePartialFeedSupport(); - } - final_graphs = pg_exe->Graphs(); - member_->executor_.reset(pg_exe); - member_->inference_executor_ = pg_exe; - } else { - LOG_IF(WARNING, details::HasKeepLastReadOp(*graph)) - << "drop_last=False for DataLoader is not supported in training " - "network. 
It is automatically turned to drop_last=True."; - if (exec_strategy.type_ == ExecutionStrategy::kDefault) { - VLOG(3) << "use ThreadedSSAGraphExecutor"; - member_->executor_.reset(new details::ThreadedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, - member_->places_, graph)); - } else { - if (member_->use_device_ == p::kXPU) { -#if defined(PADDLE_WITH_XPU) - VLOG(3) << "use BindThreadedSSAGraphExecutor"; - member_->executor_.reset(new details::BindThreadedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, - member_->local_exec_scopes_, member_->places_, graph)); -#else - PADDLE_THROW(platform::errors::PermissionDenied( - "Paddle can't use XPU device since it's not compiled with XPU," - "Please recompile or reinstall Paddle with XPU support.")); -#endif - } else { - VLOG(3) << "use FastThreadedSSAGraphExecutor"; - member_->executor_.reset(new details::FastThreadedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, - member_->local_exec_scopes_, member_->places_, graph)); - } - } - final_graphs.emplace_back(graph); - } - } - - VLOG(3) << "use ScopeBufferedSSAGraphExecutor"; - if (!member_->build_strategy_.async_mode_) { - member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, - std::move(var_infos), member_->places_, std::move(member_->executor_))); - } - - for (auto *g : final_graphs) { - auto ops = ir::FilterByNodeWrapper(*g); - for (auto *op : ops) { - op->SetLocalExecScopes(scope_map); - } - } - - if (final_graphs.size() == 1) { - ir::SetReaderOpDeviceInfo(final_graphs[0], member_->places_.size()); - } else { - for (size_t i = 0; i < final_graphs.size(); ++i) { - ir::SetReaderOpDeviceInfo(final_graphs[i], member_->places_.size(), i); - } - } -} - -void ParallelExecutor::BCastParamsToDevices( - const std::vector &vars, int trainer_id) const { - VLOG(3) << "BCastParamsToDevices"; - // the initializing bcast, all vars would be bcast from device(0). 
- for (auto &var : vars) { - framework::Variable *main_var = member_->local_scopes_[0]->FindVar(var); - if (main_var == nullptr || !main_var->IsType()) { - continue; - } - - auto &main_tensor = main_var->Get(); - if (!main_tensor.IsInitialized()) { - VLOG(3) << "one in var not inited, return!"; - continue; - } - auto &dims = main_tensor.dims(); - if (paddle::platform::is_gpu_place(main_tensor.place())) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - std::vector buffers; - buffers.reserve(member_->places_.size()); - size_t numel = main_tensor.numel(); - ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type()); - for (size_t i = 0; i < member_->places_.size(); ++i) { - auto place = member_->places_[i]; - void *buffer; - - if (i == 0 && trainer_id == 0) { - buffer = const_cast(main_tensor.data()); - } else { - auto local_scope = member_->local_scopes_[i]; - auto *t = local_scope->Var(var)->GetMutable(); - t->Resize(dims); - buffer = t->mutable_data(place, main_tensor.type()); - } - buffers.push_back(buffer); - } + if (i == 0 && trainer_id == 0) { + buffer = const_cast(main_tensor.data()); + } else { + auto local_scope = member_->local_scopes_[i]; + auto *t = local_scope->Var(var)->GetMutable(); + t->Resize(dims); + buffer = t->mutable_data(place, main_tensor.type()); + } + buffers.push_back(buffer); + } PADDLE_ENFORCE_EQ(member_->places_.size(), buffers.size(), platform::errors::PreconditionNotMet( @@ -1364,6 +1058,412 @@ bool ParallelExecutor::EnableParallelGraphExecution( return enable_parallel_graph; } +void ParallelExecutor::InitExecutorPrivateMemberInfo( + const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy, + size_t device_count, const ir::Graph &graph) { + member_->use_device_ = exec_strategy.use_device_; + member_->build_strategy_ = build_strategy; + member_->use_all_reduce_ = member_->build_strategy_.reduce_ == + BuildStrategy::ReduceStrategy::kAllReduce; + member_->nranks_ = build_strategy.num_trainers_ * device_count; + if (!member_->use_all_reduce_ && member_->nranks_ == 1) { + LOG(INFO) << "If you set build_strategy.reduce with 'Reduce'," + "the number of places should be greater than 1."; + member_->build_strategy_.reduce_ = + BuildStrategy::ReduceStrategy::kAllReduce; + member_->use_all_reduce_ = true; + } +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && defined(_WIN32) + if (member_->IsUseCUDA(member_->use_device_)) { + PADDLE_ENFORCE_EQ( + device_count, 1, + platform::errors::Unavailable("Windows can support Single GPU only.")); + } +#endif + +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \ + (!defined(PADDLE_WITH_NCCL) && !defined(PADDLE_WITH_RCCL)) + if (member_->IsUseCUDA(member_->use_device_)) { + PADDLE_ENFORCE_EQ( + device_count, 1, + platform::errors::PermissionDenied( + "Your machine has multiple cards, " + "but the WITH_NCCL option is not turned on during compilation, " + "and you cannot use multi-card training or prediction. 
" + "Please recompile and turn on the WITH_NCCL option.")); + } +#endif + + std::string device_name; + if (member_->use_device_ == p::kCPU) { + device_name = "CPU"; + } else if (member_->use_device_ == p::kCUDA) { + device_name = "CUDA"; + } else { + device_name = "XPU"; + } + + VLOG(1) << string::Sprintf( + "The Program will be executed on %s using ParallelExecutor, %lu " + "cards are used, so %lu programs are executed in parallel.", + device_name, device_count, device_count); + + // FIXME(Yancey1989): parallel graph mode get better performance + // in GPU allreduce distributed training. Need an elegant way to + // choice the execution strategy. + member_->build_strategy_.enable_parallel_graph_ = + EnableParallelGraphExecution(graph, exec_strategy, + member_->build_strategy_); + if (member_->build_strategy_.enable_parallel_graph_) { + LOG(INFO) << "The Executor would execute the graph by ParallelGraph " + "Execution which can get better performance," + << "you can force it off by env FLAGS_enable_parallel_graph=0"; + } +} + +void ParallelExecutor::CreateLocalScopes( + Scope *global_scope, const std::vector &local_scopes, + bool create_new) { + if (local_scopes.empty()) { + member_->own_local_scope_ = true; + member_->local_scopes_.emplace_back(global_scope); + for (size_t i = 1; i < member_->places_.size(); ++i) { + member_->local_scopes_.emplace_back(&global_scope->NewScope()); + } + } else { + member_->own_local_scope_ = false; + PADDLE_ENFORCE_EQ(member_->places_.size(), local_scopes.size(), + platform::errors::PreconditionNotMet( + "member_->places_.size() = %d is not equal to " + "local_scopes.size() = %d", + member_->places_.size(), local_scopes.size())); + for (size_t i = 0; i < member_->places_.size(); ++i) { + if (create_new) { + member_->local_scopes_.emplace_back(&local_scopes[i]->NewScope()); + } else { + // Use local scopes directly + member_->local_scopes_.emplace_back(local_scopes[i]); + } + } + } +} + +std::unordered_map ParallelExecutor::CreateLocalExecScopes( + const std::vector &local_scopes, bool create_new) { + std::unordered_map scope_map; + + for (auto *scope : local_scopes) { + Scope *local_exec_scope = scope; + if (create_new) { + local_exec_scope = &scope->NewScope(); + } + member_->local_exec_scopes_.emplace_back(local_exec_scope); + scope_map.emplace(scope, local_exec_scope); + } + + PADDLE_ENFORCE_EQ( + member_->local_scopes_.size(), member_->local_exec_scopes_.size(), + platform::errors::PreconditionNotMet( + "member_->local_scopes_.size() = %d is not equal to " + "member_->local_exec_scopes_.size() = %d", + member_->local_scopes_.size(), member_->local_exec_scopes_.size())); + + return scope_map; +} + +std::vector ParallelExecutor::CloneGraphToMultiDevices( + ir::Graph *graph) { + std::vector graphs; + if (member_->build_strategy_.async_mode_) { + PADDLE_ENFORCE_EQ(member_->IsUseCUDA(member_->use_device_), false, + platform::errors::Unavailable( + "gpu mode does not support async_mode_ now!")); + graphs.push_back(graph); + for (size_t i = 1; i < member_->places_.size(); ++i) { + auto *tmp_graph = new ir::Graph(graph->OriginProgram()); + async_graphs_.emplace_back(tmp_graph); + graphs.push_back(tmp_graph); + } + } + + return graphs; +} + +void ParallelExecutor::PrepareNCCLCommunicator(Scope *global_scope) { + if (member_->IsUseCUDA(member_->use_device_) && member_->nranks_ > 1) { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + member_->InitOrGetNCCLCommunicator(global_scope, &member_->build_strategy_); + + // Initialize device context's nccl 
comm, will be used by normal + // Operators like sync_batch_norm, and collective ops. + // NOTE: more than one ParallelExecutor with same place, the nccl comm will + // be rewrite and there will be some problem. + // NOTE: NCCL group-calls and non-group-calls can not use the same + // NCCL communicator, so for ParallelGraph and Multi-Process mode, re-use + // same communicators. + auto *nccl_ctxs = member_->nccl_ctxs_->GetSyncBatchNormCtx( + global_scope, member_->places_); + auto &pool = platform::DeviceContextPool::Instance(); + for (size_t dev_id = 0; dev_id < member_->places_.size(); ++dev_id) { + auto *dev_ctx = static_cast( + pool.Get(member_->places_[dev_id])); + auto &nccl_ctx = nccl_ctxs->at(member_->places_[dev_id]); + dev_ctx->set_nccl_comm(nccl_ctx.comm()); + } +#else + PADDLE_THROW( + platform::errors::PreconditionNotMet("Not compiled with CUDA.")); +#endif + } + if (member_->use_device_ == p::kXPU && member_->nranks_ > 1) { +#if defined(PADDLE_WITH_XPU_BKCL) + member_->InitOrGetBKCLCommunicator(global_scope, member_->build_strategy_); + + auto *bkcl_ctxs = member_->bkcl_ctxs_->GetSyncBatchNormCtx( + global_scope, member_->places_); + auto &pool = platform::DeviceContextPool::Instance(); + for (size_t dev_id = 0; dev_id < member_->places_.size(); ++dev_id) { + auto *dev_ctx = static_cast( + pool.Get(member_->places_[dev_id])); + auto &bkcl_ctx = bkcl_ctxs->at(member_->places_[dev_id]); + dev_ctx->set_bkcl_context(bkcl_ctx.comm()); + } +#else + PADDLE_THROW( + platform::errors::PreconditionNotMet("Not compiled with XPU.")); +#endif + } +} + +std::vector ParallelExecutor::CompileGraphWithBuildStrategy( + ir::Graph *graph, std::vector *device_graphs, + const std::string &loss_var_name) { + auto device_count = member_->places_.size(); + std::vector async_graphs(device_count); + + auto &graphs = *device_graphs; +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + if (member_->build_strategy_.async_mode_) { + PADDLE_ENFORCE_EQ(graphs.size(), device_count, + platform::errors::PreconditionNotMet( + "graphs.size() shoule be %d, but received %d", + device_count, graphs.size())); + VLOG(3) << "use local async mode"; + graph = member_->build_strategy_.Apply( + graph, {member_->places_[0]}, loss_var_name, + {member_->local_scopes_[0]}, 1, member_->use_device_, + member_->nccl_ctxs_); + for (size_t i = 1; i < device_count; ++i) { + graphs[i] = member_->build_strategy_.Apply( + graphs[i], {member_->places_[i]}, loss_var_name, + {member_->local_scopes_[i]}, 1, member_->use_device_, + member_->nccl_ctxs_); + async_graphs[i] = graphs[i]; + } + } else { + graph = member_->build_strategy_.Apply( + graph, member_->places_, loss_var_name, member_->local_scopes_, + member_->nranks_, member_->use_device_, member_->nccl_ctxs_); + } +#elif defined(PADDLE_WITH_XPU_BKCL) + if (member_->build_strategy_.async_mode_) { + PADDLE_ENFORCE_EQ(graphs.size(), device_count, + platform::errors::PreconditionNotMet( + "graphs.size() shoule be %d, but received %d", + device_count, graphs.size())); + VLOG(3) << "use local async mode"; + graph = member_->build_strategy_.Apply( + graph, {member_->places_[0]}, loss_var_name, + {member_->local_scopes_[0]}, 1, member_->use_device_, + member_->bkcl_ctxs_); + for (size_t i = 1; i < device_count; ++i) { + graphs[i] = member_->build_strategy_.Apply( + graphs[i], {member_->places_[i]}, loss_var_name, + {member_->local_scopes_[i]}, 1, member_->use_device_, + member_->bkcl_ctxs_); + async_graphs[i] = graphs[i]; + } + } else { + graph = member_->build_strategy_.Apply( + 
graph, member_->places_, loss_var_name, member_->local_scopes_, + member_->nranks_, member_->use_device_, member_->bkcl_ctxs_); + } +#else + if (member_->build_strategy_.async_mode_) { + VLOG(3) << "use local async mode"; + graph = member_->build_strategy_.Apply( + graph, {member_->places_[0]}, loss_var_name, + {member_->local_scopes_[0]}, 1, member_->use_device_); + for (size_t i = 1; i < device_count; ++i) { + graphs[i] = member_->build_strategy_.Apply( + graphs[i], {member_->places_[i]}, loss_var_name, + {member_->local_scopes_[i]}, 1, member_->use_device_); + async_graphs[i] = graphs[i]; + } + } else { + graph = member_->build_strategy_.Apply( + graph, member_->places_, loss_var_name, member_->local_scopes_, + member_->nranks_, member_->use_device_); + } +#endif + + return async_graphs; +} + +void ParallelExecutor::CreateVariableInfos( + std::vector *var_infos, ir::Graph *graph) { + PADDLE_ENFORCE_EQ( + var_infos->size(), 0, + platform::errors::PreconditionNotMet( + "var_infos->size() should be 0, but received %d", var_infos->size())); + PADDLE_ENFORCE_EQ( + member_->is_persistable_.size(), 0, + platform::errors::PreconditionNotMet( + "member_->is_persistable_.size() should be 0, but received %d", + member_->is_persistable_.size())); + for (auto &node : graph->Nodes()) { + if (node->IsVar() && !node->IsCtrlVar() && node->Var()) { + var_infos->emplace_back(); + var_infos->back().name_ = node->Var()->Name(); + var_infos->back().type_ = node->Var()->GetType(); + var_infos->back().persistable_ = node->Var()->Persistable(); + + member_->is_persistable_.emplace(node->Var()->Name(), + node->Var()->Persistable()); + } + } + + if (graph->Has(details::kFusedVars)) { + auto &fused_vars = graph->Get(details::kFusedVars); + for (auto &fused_var : fused_vars) { + var_infos->emplace_back(); + var_infos->back() = fused_var.second; + + member_->is_persistable_.emplace(fused_var.first, + fused_var.second.persistable_); + } + } +} + +std::vector ParallelExecutor::CreateSSAGraphExecutor( + const ExecutionStrategy &exec_strategy, + std::vector *async_graphs, ir::Graph *graph) { + std::vector final_graphs; + + if (member_->build_strategy_.async_mode_) { + VLOG(3) << "use AsyncSSAGraphExecutor"; + member_->executor_.reset(new details::AsyncSSAGraphExecutor( + exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, + member_->places_, *async_graphs)); + final_graphs = *async_graphs; + } else if (member_->build_strategy_.enable_parallel_graph_) { + VLOG(3) << "use ParallelSSAGraphExecutor"; +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + // TODO(Yancey1989): Remove passing in the main_program when + // allreduce_seq_pass doesn't need it as the attr.
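+    // Detect a data-parallel inference graph: in this branch the ParallelSSAGraphExecutor keeps one graph per device, and partial feed support can be enabled for inference below.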
+ bool is_inference = details::IsDataParallelInferenceGraph(*graph); + bool has_drop_last_read_op = details::HasDropLastReadOp(*graph); + + auto *pg_exe = new details::ParallelSSAGraphExecutor( + exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, + member_->places_, graph); + final_graphs = pg_exe->Graphs(); + member_->executor_.reset(pg_exe); + + if (is_inference && member_->places_.size() > 1) { + member_->inference_executor_ = pg_exe; + if (!has_drop_last_read_op) { + VLOG(5) << "Enable partial feed support in inference phase"; + pg_exe->EnablePartialFeedSupport(); + } + } +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "Paddle should be compiled with CUDA for ParallelGraph Execution.")); +#endif + } else { + bool has_drop_last_read_op = details::HasDropLastReadOp(*graph); + auto possible_inference_graphs = + details::TrySeparateToMultipleSingleDeviceGraphs(graph); + if (!possible_inference_graphs.empty()) { + VLOG(5) << "Use ParallelSSAGraphExecutor in inference phase"; + auto *pg_exe = new details::ParallelSSAGraphExecutor( + exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, + member_->places_, std::move(possible_inference_graphs)); + if (!has_drop_last_read_op) { + VLOG(5) << "Enable partial feed support in inference phase"; + pg_exe->EnablePartialFeedSupport(); + } + final_graphs = pg_exe->Graphs(); + member_->executor_.reset(pg_exe); + member_->inference_executor_ = pg_exe; + } else { + LOG_IF(WARNING, details::HasKeepLastReadOp(*graph)) + << "drop_last=False for DataLoader is not supported in training " + "network. It is automatically turned to drop_last=True."; + if (exec_strategy.type_ == ExecutionStrategy::kDefault) { + VLOG(3) << "use ThreadedSSAGraphExecutor"; + member_->executor_.reset(new details::ThreadedSSAGraphExecutor( + exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, + member_->places_, graph)); + } else { + if (member_->use_device_ == p::kXPU) { +#if defined(PADDLE_WITH_XPU) + VLOG(3) << "use BindThreadedSSAGraphExecutor"; + member_->executor_.reset(new details::BindThreadedSSAGraphExecutor( + exec_strategy, member_->local_scopes_, + member_->local_exec_scopes_, member_->places_, graph)); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Paddle can't use XPU device since it's not compiled with XPU," + "Please recompile or reinstall Paddle with XPU support.")); +#endif + } else { + VLOG(3) << "use FastThreadedSSAGraphExecutor"; + member_->executor_.reset(new details::FastThreadedSSAGraphExecutor( + exec_strategy, member_->local_scopes_, + member_->local_exec_scopes_, member_->places_, graph)); + } + } + final_graphs.emplace_back(graph); + } + } + return final_graphs; +} + +void ParallelExecutor::ResetOpHandleScopeMapOfGraphs( + const std::vector &final_graphs, + const std::unordered_map &scope_map) { + PADDLE_ENFORCE_GE( + final_graphs.size(), 1, + platform::errors::PreconditionNotMet( + "final_graphs shoule contain at least one graph, but received %d", + final_graphs.size())); + + PADDLE_ENFORCE_GT(scope_map.size(), 0, + platform::errors::PreconditionNotMet( + "scope_map shoule contain at least one " + "element, but received %d", + scope_map.size())); + for (auto *g : final_graphs) { + auto ops = ir::FilterByNodeWrapper(*g); + for (auto *op : ops) { + op->SetLocalExecScopes(scope_map); + } + } +} + +void ParallelExecutor::SetReaderOpDeviceInfoOfGraphs( + const std::vector &final_graphs) { + if (final_graphs.size() == 1) { + ir::SetReaderOpDeviceInfo(final_graphs[0], 
member_->places_.size()); + } else { + for (size_t i = 0; i < final_graphs.size(); ++i) { + ir::SetReaderOpDeviceInfo(final_graphs[i], member_->places_.size(), i); + } + } +} + const ir::Graph &ParallelExecutor::Graph() const { return member_->executor_->Graph(); } diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index 47de7dc48f4f2cc7f12fd76be7e2b2b041bb7160..d4d0b534b55f05f1e4145064eb55e7858f25185d 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -24,6 +24,7 @@ limitations under the License. */ #include "paddle/fluid/framework/details/build_strategy.h" #include "paddle/fluid/framework/details/execution_strategy.h" #include "paddle/fluid/framework/details/op_handle_base.h" +#include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h" #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/feed_fetch_type.h" #include "paddle/fluid/framework/op_info.h" @@ -41,6 +42,7 @@ namespace framework { class ParallelExecutorPrivate; +using details::VariableInfo; using details::BuildStrategy; using details::ExecutionStrategy; namespace p = paddle::platform; @@ -93,6 +95,40 @@ class ParallelExecutor { const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy) const; + void InitExecutorPrivateMemberInfo(const ExecutionStrategy &exec_strategy, + const BuildStrategy &build_strategy, + size_t device_count, + const ir::Graph &graph); + + void CreateLocalScopes(Scope *global_scope, + const std::vector &local_scopes, + bool create_new); + + std::unordered_map CreateLocalExecScopes( + const std::vector &local_scopes, bool create_new); + + std::vector CloneGraphToMultiDevices(ir::Graph *graph); + + void PrepareNCCLCommunicator(Scope *global_scope); + + std::vector CompileGraphWithBuildStrategy( + ir::Graph *graph, std::vector *graphs, + const std::string &loss_var_name); + + void CreateVariableInfos(std::vector *var_infos, + ir::Graph *graph); + + std::vector CreateSSAGraphExecutor( + const ExecutionStrategy &exec_strategy, + std::vector *async_graphs, ir::Graph *graph); + + void ResetOpHandleScopeMapOfGraphs( + const std::vector &final_graphs, + const std::unordered_map &scope_map); + + void SetReaderOpDeviceInfoOfGraphs( + const std::vector &final_graphs); + ParallelExecutorPrivate *member_; std::vector> async_graphs_; }; diff --git a/paddle/fluid/framework/pipeline_trainer.cc b/paddle/fluid/framework/pipeline_trainer.cc index 8d350f70165b667f8ea5e152dce29e3dfac74ec0..3bd50229b94deb0e09f242d529c3225ec2e4408a 100644 --- a/paddle/fluid/framework/pipeline_trainer.cc +++ b/paddle/fluid/framework/pipeline_trainer.cc @@ -12,7 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
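+// NOTE: pipeline training is now also compiled for Ascend NPU builds (PADDLE_WITH_ASCEND_CL), in addition to NCCL/RCCL GPU builds.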
-#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(PADDLE_WITH_ASCEND_CL) #include "paddle/fluid/framework/data_feed_factory.h" #include "paddle/fluid/framework/device_worker_factory.h" #include "paddle/fluid/framework/trainer.h" @@ -24,6 +25,9 @@ namespace framework { void PipelineTrainer::Initialize(const TrainerDesc& trainer_desc, Dataset* dataset) { const auto& section_params = trainer_desc.section_param(); + const int num_pipeline_stages_ = section_params.num_pipeline_stages(); + const int pipeline_stage_ = section_params.pipeline_stage(); + const int schedule_mode_ = section_params.schedule_mode(); num_microbatches_ = section_params.num_microbatches(); VLOG(3) << "Number of microbatches per minibatch: " << num_microbatches_; trainer_desc_ = trainer_desc; @@ -31,7 +35,11 @@ void PipelineTrainer::Initialize(const TrainerDesc& trainer_desc, ParseDumpConfig(trainer_desc); const auto& section_config = section_params.section_config(); int place_id = section_config.place_id(); +#if (defined PADDLE_WITH_NCCL) || (defined PADDLE_WITH_RCCL) place_ = platform::CUDAPlace(place_id); +#elif (defined PADDLE_WITH_ASCEND_CL) // NOLINT + place_ = platform::NPUPlace(place_id); +#endif worker_ = DeviceWorkerFactory::CreateDeviceWorker( trainer_desc.device_worker_name()); auto this_worker = @@ -39,6 +47,9 @@ void PipelineTrainer::Initialize(const TrainerDesc& trainer_desc, this_worker->SetPlace(place_); this_worker->Initialize(trainer_desc); this_worker->SetMicrobatchNum(num_microbatches_); + this_worker->SetPipelineStageNum(num_pipeline_stages_); + this_worker->SetPipelineStage(pipeline_stage_); + this_worker->SetScheduleMode(schedule_mode_); } void PipelineTrainer::InitOtherEnv(const ProgramDesc& main_program) { @@ -65,35 +76,16 @@ void PipelineTrainer::CopyParameters(int microbatch_id, const ProgramDesc& program, const platform::Place& place) { auto& global_block = program.Block(0); - std::map param_map; - for (auto& var : global_block.AllVars()) { - if (var->Persistable()) { - param_map[var->Name()] = 1; - } - } for (auto& var : global_block.AllVars()) { - bool is_param_grad = false; - size_t pos = 0; - if ((pos = var->Name().find(kGradVarSuffix)) != std::string::npos) { - auto prefix_name = var->Name().substr(0, pos); - if (param_map.find(prefix_name) != param_map.end()) { - is_param_grad = true; - } - } if (var->Persistable() && microbatch_id == 0) { auto* ptr = root_scope_->Var(var->Name()); InitializeVariable(ptr, var->GetType()); - VLOG(3) << "Create persistable var: " << var->Name() - << ", which pointer is " << ptr; - } else if (is_param_grad && microbatch_id == 0) { - auto* ptr = minibatch_scope_->Var(var->Name()); - InitializeVariable(ptr, var->GetType()); - VLOG(3) << "Create grad for persistable var: " << var->Name() + VLOG(5) << "Create persistable var: " << var->Name() << ", which pointer is " << ptr; - } else if (!var->Persistable() && !is_param_grad) { + } else if (!var->Persistable()) { auto* ptr = microbatch_scopes_[microbatch_id]->Var(var->Name()); - VLOG(3) << "Create variable " << var->Name() << " for microbatch " + VLOG(5) << "Create variable " << var->Name() << " for microbatch " << microbatch_id << ", which pointer is " << ptr; InitializeVariable(ptr, var->GetType()); } diff --git a/paddle/fluid/framework/program_desc.h b/paddle/fluid/framework/program_desc.h index cfef80b8d3777817d767ba201e4dad85cf8bdc9c..4ceb0c5c8248143791238a7b9077e402a7c1b832 100644 --- 
a/paddle/fluid/framework/program_desc.h +++ b/paddle/fluid/framework/program_desc.h @@ -20,7 +20,6 @@ limitations under the License. */ #include #include "paddle/fluid/framework/block_desc.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/proto_desc.h" #include "paddle/fluid/platform/macros.h" diff --git a/paddle/fluid/framework/ps_gpu_trainer.cc b/paddle/fluid/framework/ps_gpu_trainer.cc index e77932fa5f226518f7be4177488d6cc55f2fce06..39bc3f040639bfb9271ed808c8905b4bb5e89a92 100644 --- a/paddle/fluid/framework/ps_gpu_trainer.cc +++ b/paddle/fluid/framework/ps_gpu_trainer.cc @@ -19,10 +19,6 @@ limitations under the License. */ #include "paddle/fluid/framework/data_feed_factory.h" #include "paddle/fluid/framework/data_set.h" #include "paddle/fluid/framework/device_worker_factory.h" -#include "paddle/fluid/framework/fleet/fleet_wrapper.h" -#include "paddle/fluid/framework/fleet/heter_context.h" -#include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" -#include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h" #include "paddle/fluid/framework/trainer.h" #if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \ (defined PADDLE_WITH_PSLIB) @@ -64,7 +60,6 @@ void PSGPUTrainer::Initialize(const TrainerDesc& trainer_desc, pull_dense_worker_ = PullDenseWorker::GetInstance(); pull_dense_worker_->Initialize(trainer_desc); SetDebug(trainer_desc.debug()); - fleet_ptr_ = FleetWrapper::GetInstance(); trainer_desc_ = trainer_desc; workers_.resize(place_num); for (int i = 0; i < place_num; ++i) { diff --git a/paddle/fluid/framework/ps_gpu_worker.cc b/paddle/fluid/framework/ps_gpu_worker.cc index 2597901d91f36bab7e6a1e3553d8c43bb7a686f8..66d8a40dda160752e64eae8775a2045509e575e3 100644 --- a/paddle/fluid/framework/ps_gpu_worker.cc +++ b/paddle/fluid/framework/ps_gpu_worker.cc @@ -14,8 +14,6 @@ limitations under the License. */ #include "paddle/fluid/framework/device_worker.h" #include "paddle/fluid/framework/device_worker_factory.h" -#include "paddle/fluid/framework/fleet/fleet_wrapper.h" -#include "paddle/fluid/framework/fleet/heter_wrapper.h" #include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/string/string_helper.h" @@ -130,8 +128,6 @@ void PSGPUWorker::Initialize(const TrainerDesc& desc) { } } } - // pull_queue_ = paddle::framework::MakeChannel>(); - // push_queue_ = paddle::framework::MakeChannel>(); } void PSGPUWorker::SetChannelWriter(ChannelObject* queue) { diff --git a/paddle/fluid/framework/reader.h b/paddle/fluid/framework/reader.h index a4207deb7e8113ab07b8b7f9b227e121f3e0f1bc..e7c23eab1fa5fcab8137c12845beeb237d784e16 100644 --- a/paddle/fluid/framework/reader.h +++ b/paddle/fluid/framework/reader.h @@ -20,7 +20,6 @@ #include #include "paddle/fluid/framework/ddim.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/platform/place.h" diff --git a/paddle/fluid/framework/section_worker.cc b/paddle/fluid/framework/section_worker.cc index 735c86faf082b817ee0a0ff35d127112fe3058b4..00ff50abadd185eb6ac8907d89aaf455dd5a7f16 100644 --- a/paddle/fluid/framework/section_worker.cc +++ b/paddle/fluid/framework/section_worker.cc @@ -9,7 +9,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(PADDLE_WITH_ASCEND_CL) #include #include "paddle/fluid/framework/device_worker.h" #include "paddle/fluid/framework/executor_gc_helper.h" @@ -22,15 +23,79 @@ class TrainerDesc; uint64_t SectionWorker::batch_id_(0); -void SectionWorker::Initialize(const TrainerDesc& desc) { +void SectionWorker::Initialize(const TrainerDesc &desc) { dev_ctx_ = platform::DeviceContextPool::Instance().Get(place_); program_.reset( new ProgramDesc(desc.section_param().section_config().program_desc())); - for (auto& op_desc : program_->Block(0).AllOps()) { + for (auto &op_desc : program_->Block(0).AllOps()) { ops_.push_back(OpRegistry::CreateOp(*op_desc)); } } +void SectionWorker::RunForward( + int micro_id, std::unique_ptr &gc, + std::unordered_map> + &unused_vars_) { + for (auto &op : ops_) { + int op_role = op->Attr(std::string("op_role")); + // We run op with op_role = kLRSched only for the first microbatch + // to avoid increasing the @LR_DECAY_STEP@ multiple times. + bool run_first_mbatch = (op_role == static_cast(OpRole::kForward)) || + (op_role == (static_cast(OpRole::kForward) | + static_cast(OpRole::kLoss))) || + (op_role == static_cast(OpRole::kLRSched)); + bool run_others = (op_role == static_cast(OpRole::kForward)) || + (op_role == (static_cast(OpRole::kForward) | + static_cast(OpRole::kLoss))); + if ((micro_id == 0 && run_first_mbatch) || (micro_id != 0 && run_others)) { + VLOG(3) << "Forward: running op " << op->Type() << " for micro-batch " + << micro_id; + op->Run(*microbatch_scopes_[micro_id], place_); + if (gc) { + DeleteUnusedTensors(*microbatch_scopes_[micro_id], op.get(), + unused_vars_, gc.get()); + } + } + } +} + +void SectionWorker::RunBackward( + int micro_id, std::unique_ptr &gc, + std::unordered_map> + &unused_vars_) { + for (auto &op : ops_) { + int op_role = op->Attr(std::string("op_role")); + if ((op_role == static_cast(OpRole::kBackward)) || + (op_role == (static_cast(OpRole::kBackward) | + static_cast(OpRole::kLoss)))) { + VLOG(3) << "Backward: running op " << op->Type() << " for micro-batch " + << micro_id; + op->Run(*microbatch_scopes_[micro_id], place_); + if (gc) { + DeleteUnusedTensors(*microbatch_scopes_[micro_id], op.get(), + unused_vars_, gc.get()); + } + } + } +} + +void SectionWorker::RunUpdate( + std::unique_ptr &gc, + std::unordered_map> + &unused_vars_) { + for (auto &op : ops_) { + int op_role = op->Attr(std::string("op_role")); + if (op_role == static_cast(OpRole::kOptimize)) { + VLOG(3) << "Update: running op " << op->Type(); + op->Run(*microbatch_scopes_[num_microbatches_ - 1], place_); + if (gc) { + DeleteUnusedTensors(*microbatch_scopes_[num_microbatches_ - 1], + op.get(), unused_vars_, gc.get()); + } + } + } +} + void SectionWorker::TrainFiles() { VLOG(5) << "begin section_worker TrainFiles"; @@ -48,69 +113,56 @@ void SectionWorker::TrainFiles() { #endif } - for (int i = 0; i < num_microbatches_; ++i) { - for (auto& op : ops_) { - int op_role = op->Attr(std::string("op_role")); - // We run op with op_role = kLRSched only for the first microbatch - // to avoid increasing the @LR_DECAY_STEP@ multiple times. 
- bool run_first_mbatch = op_role == static_cast(OpRole::kForward) || - op_role == (static_cast(OpRole::kForward) | - static_cast(OpRole::kLoss)) || - op_role == static_cast(OpRole::kLRSched); - bool run_others = op_role == static_cast(OpRole::kForward) || - op_role == (static_cast(OpRole::kForward) | - static_cast(OpRole::kLoss)); - if ((i == 0 && run_first_mbatch) || (i != 0 && run_others)) { - VLOG(3) << "Forward: running op " << op->Type() << " for micro-batch " - << i; - op->Run(*microbatch_scopes_[i], place_); - if (gc) { - DeleteUnusedTensors(*microbatch_scopes_[i], op.get(), unused_vars_, - gc.get()); - } - } + if (schedule_mode_ == 0) { + // F-then-B scheduler which runs Forward phase for all microbatches, + // then runs Backward phase for all microbatches. + // step1: run forward + for (int i = 0; i < num_microbatches_; ++i) { + RunForward(i, gc, unused_vars_); } -#ifdef PADDLE_WITH_RCCL - hipDeviceSynchronize(); -#else - cudaDeviceSynchronize(); -#endif - } - - // backward pass - for (int i = 0; i < num_microbatches_; ++i) { - for (auto& op : ops_) { - int op_role = op->Attr(std::string("op_role")); - if (op_role == static_cast(OpRole::kBackward) || - op_role == (static_cast(OpRole::kBackward) | - static_cast(OpRole::kLoss))) { - VLOG(3) << "Backward: running op " << op->Type() << " for micro-batch " - << i; - op->Run(*microbatch_scopes_[i], place_); - if (gc) { - DeleteUnusedTensors(*microbatch_scopes_[i], op.get(), unused_vars_, - gc.get()); - } - } + // step2: run backward + for (int i = 0; i < num_microbatches_; ++i) { + RunBackward(i, gc, unused_vars_); + } + // step3: run update + RunUpdate(gc, unused_vars_); + } else { + // 1F1B scheduler, which runs forward phase and backward phase alternately + // after startup phase. For a stage, the number of microbatches for + // startup is num_pipeline_stages_ - pipeline_stage_ - 1, where + // num_pipeline_stages_ is the total number of pipeline stages and + // pipeline_stage_ is the pipeline stage of the current device.
+ auto startup_steps = num_pipeline_stages_ - pipeline_stage_ - 1; + VLOG(3) << "startup_steps:" << startup_steps + << ", num_stages: " << num_pipeline_stages_ + << ", stage:" << pipeline_stage_; + PADDLE_ENFORCE_GT( + num_microbatches_, startup_steps, + platform::errors::InvalidArgument( + "To use pipeline with 1F1B scheduler, please make sure number of " + "microbatches (%d) is larger than startup steps (%d).", + num_microbatches_, startup_steps)); + int fw_step = 0; + int bw_step = 0; + // startup phase + while (fw_step < startup_steps) { + RunForward(fw_step, gc, unused_vars_); + fw_step += 1; } -#ifdef PADDLE_WITH_RCCL - hipDeviceSynchronize(); -#else - cudaDeviceSynchronize(); -#endif - } - // update pass - for (auto& op : ops_) { - int op_role = op->Attr(std::string("op_role")); - if (op_role == static_cast(OpRole::kOptimize)) { - VLOG(3) << "Update: running op " << op->Type(); - op->Run(*microbatch_scopes_[0], place_); - if (gc) { - DeleteUnusedTensors(*microbatch_scopes_[0], op.get(), unused_vars_, - gc.get()); - } + // 1f1b phase + while (fw_step < num_microbatches_) { + RunForward(fw_step, gc, unused_vars_); + fw_step += 1; + RunBackward(bw_step, gc, unused_vars_); + bw_step += 1; + } + // backward phase + while (bw_step < num_microbatches_) { + RunBackward(bw_step, gc, unused_vars_); + bw_step += 1; } + RunUpdate(gc, unused_vars_); } dev_ctx_->Wait(); ++batch_id_; diff --git a/paddle/fluid/framework/selected_rows.cc b/paddle/fluid/framework/selected_rows.cc index 4c30c40ad58375fb08f23e2c7bdef27fdaea7384..7e48d0dc5f96203c4bc89f954b82dfa582eddbc9 100644 --- a/paddle/fluid/framework/selected_rows.cc +++ b/paddle/fluid/framework/selected_rows.cc @@ -113,6 +113,21 @@ void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows, TensorToStream(os, selected_rows.value(), dev_ctx); } +void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows) { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + const platform::DeviceContext* dev_ctx; + auto place = selected_rows.place(); + dev_ctx = pool.Get(place); + SerializeToStream(os, selected_rows, *dev_ctx); +} + +void DeserializeFromStream(std::ifstream& os, SelectedRows* selected_rows) { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + const platform::DeviceContext* dev_ctx; + dev_ctx = pool.Get(platform::CPUPlace()); + DeserializeFromStream(os, selected_rows, *dev_ctx); +} + void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows, const platform::DeviceContext& dev_ctx) { { diff --git a/paddle/fluid/framework/selected_rows.h b/paddle/fluid/framework/selected_rows.h index 48353b43f56cacbb71512b9e743af281b09fc531..e53e3d973c524657a7b579d96d0f51a39ba40f12 100644 --- a/paddle/fluid/framework/selected_rows.h +++ b/paddle/fluid/framework/selected_rows.h @@ -173,5 +173,9 @@ void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows, void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows, const platform::DeviceContext& dev_ctx); +void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows); + +void DeserializeFromStream(std::ifstream& os, SelectedRows* selected_rows); + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/tensor_test.cc b/paddle/fluid/framework/tensor_test.cc index 54f779813063362956130ec9314365f89a234c1e..101463756c0a5143536362c706ae08333673c831 100644 --- a/paddle/fluid/framework/tensor_test.cc +++ b/paddle/fluid/framework/tensor_test.cc @@ -125,
+125,54 @@ TEST(Tensor, MutableData) { float* p2 = nullptr; // initialization p1 = src_tensor.mutable_data(framework::make_ddim({1, 2, 3}), - platform::CUDAPlace()); + platform::CUDAPlace(0)); auto p1_holder = src_tensor.Holder(); EXPECT_NE(p1, nullptr); // set src_tensor a new dim with large size // momery is supposed to be re-allocated p2 = src_tensor.mutable_data(framework::make_ddim({3, 1024}), - platform::CUDAPlace()); + platform::CUDAPlace(0)); auto p2_holder = src_tensor.Holder(); EXPECT_NE(p2, nullptr); EXPECT_NE(p1_holder.get(), p2_holder.get()); // set src_tensor a new dim with same size // momery block is supposed to be unchanged p1 = src_tensor.mutable_data(framework::make_ddim({2, 2, 3}), - platform::CUDAPlace()); + platform::CUDAPlace(0)); EXPECT_EQ(p1, p2); // set src_tensor a new dim with smaller size // momery block is supposed to be unchanged p2 = src_tensor.mutable_data(framework::make_ddim({2, 2}), - platform::CUDAPlace()); + platform::CUDAPlace(0)); + EXPECT_EQ(p1, p2); + } +#endif +#ifdef PADDLE_WITH_ASCEND_CL + { + framework::Tensor src_tensor; + float* p1 = nullptr; + float* p2 = nullptr; + // initialization + p1 = src_tensor.mutable_data(framework::make_ddim({1, 2, 3}), + platform::NPUPlace(0)); + auto p1_holder = src_tensor.Holder(); + EXPECT_NE(p1, nullptr); + // set src_tensor a new dim with large size + // momery is supposed to be re-allocated + p2 = src_tensor.mutable_data(framework::make_ddim({3, 1024}), + platform::NPUPlace(0)); + auto p2_holder = src_tensor.Holder(); + EXPECT_NE(p2, nullptr); + EXPECT_NE(p1_holder.get(), p2_holder.get()); + // set src_tensor a new dim with same size + // momery block is supposed to be unchanged + p1 = src_tensor.mutable_data(framework::make_ddim({2, 2, 3}), + platform::NPUPlace(0)); + EXPECT_EQ(p1, p2); + // set src_tensor a new dim with smaller size + // momery block is supposed to be unchanged + p2 = src_tensor.mutable_data(framework::make_ddim({2, 2}), + platform::NPUPlace(0)); EXPECT_EQ(p1, p2); } #endif @@ -179,7 +208,17 @@ TEST(Tensor, ShareDataWith) { framework::Tensor src_tensor; framework::Tensor dst_tensor; src_tensor.mutable_data(framework::make_ddim({2, 3, 4}), - platform::CUDAPlace()); + platform::CUDAPlace(0)); + dst_tensor.ShareDataWith(src_tensor); + ASSERT_EQ(src_tensor.data(), dst_tensor.data()); + } +#endif +#ifdef PADDLE_WITH_ASCEND_CL + { + framework::Tensor src_tensor; + framework::Tensor dst_tensor; + src_tensor.mutable_data(framework::make_ddim({2, 3, 4}), + platform::NPUPlace(0)); dst_tensor.ShareDataWith(src_tensor); ASSERT_EQ(src_tensor.data(), dst_tensor.data()); } @@ -216,7 +255,34 @@ TEST(Tensor, Slice) { { framework::Tensor src_tensor; src_tensor.mutable_data(framework::make_ddim({6, 9}), - platform::CUDAPlace()); + platform::CUDAPlace(0)); + framework::Tensor slice_tensor = src_tensor.Slice(2, 6); + framework::DDim slice_dims = slice_tensor.dims(); + ASSERT_EQ(arity(slice_dims), 2); + EXPECT_EQ(slice_dims[0], 4); + EXPECT_EQ(slice_dims[1], 9); + + uintptr_t src_data_address = + reinterpret_cast(src_tensor.data()); + uintptr_t src_mutable_data_address = + reinterpret_cast(src_tensor.mutable_data( + src_tensor.dims(), platform::CUDAPlace(0))); + uintptr_t slice_data_address = + reinterpret_cast(slice_tensor.data()); + uintptr_t slice_mutable_data_address = + reinterpret_cast(slice_tensor.mutable_data( + slice_tensor.dims(), platform::CUDAPlace(0))); + EXPECT_EQ(src_data_address, src_mutable_data_address); + EXPECT_EQ(slice_data_address, slice_mutable_data_address); + EXPECT_EQ(src_data_address 
+ 9 * 2 * sizeof(double), slice_data_address); + } +#endif + +#ifdef PADDLE_WITH_ASCEND_CL + { + framework::Tensor src_tensor; + src_tensor.mutable_data(framework::make_ddim({6, 9}), + platform::NPUPlace(0)); framework::Tensor slice_tensor = src_tensor.Slice(2, 6); framework::DDim slice_dims = slice_tensor.dims(); ASSERT_EQ(arity(slice_dims), 2); @@ -227,12 +293,12 @@ TEST(Tensor, Slice) { reinterpret_cast(src_tensor.data()); uintptr_t src_mutable_data_address = reinterpret_cast(src_tensor.mutable_data( - src_tensor.dims(), platform::CUDAPlace())); + src_tensor.dims(), platform::NPUPlace(0))); uintptr_t slice_data_address = reinterpret_cast(slice_tensor.data()); uintptr_t slice_mutable_data_address = reinterpret_cast(slice_tensor.mutable_data( - slice_tensor.dims(), platform::CUDAPlace())); + slice_tensor.dims(), platform::NPUPlace(0))); EXPECT_EQ(src_data_address, src_mutable_data_address); EXPECT_EQ(slice_data_address, slice_mutable_data_address); EXPECT_EQ(src_data_address + 9 * 2 * sizeof(double), slice_data_address); diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index c6ac30a369859db9de244990231a307074e973ed..78fd1af09e29458ec84549c55dd99f8c29da29db 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -97,6 +97,42 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, "Copy from %s to %s is not supported.", src_place, dst_place)); } #endif +#ifdef PADDLE_WITH_ASCEND_CL + // TODO(zhiqiu): handle different condition like CUDA code below + else if (platform::is_npu_place(src_place) && // NOLINT + platform::is_cpu_place(dst_place)) { + auto stream = + reinterpret_cast(ctx).stream(); + memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr, + BOOST_GET_CONST(platform::NPUPlace, src_place), src_ptr, size, + stream); + } + else if (platform::is_cpu_place(src_place) && // NOLINT + platform::is_npu_place(dst_place)) { + auto stream = + reinterpret_cast(ctx).stream(); + memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, + BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size, + stream); + } + else if (platform::is_npu_place(src_place) && // NOLINT + platform::is_npu_place(dst_place)) { + if (src_ptr == dst_ptr) { + VLOG(3) << "Skip copy the same data async from " << src_place << " to " + << dst_place; + return; + } + auto stream = + reinterpret_cast(ctx).stream(); + memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, + BOOST_GET_CONST(platform::NPUPlace, src_place), src_ptr, size, + stream); + } + else { // NOLINT + PADDLE_THROW(platform::errors::Unimplemented( + "Copy from %s to %s is not supported.", src_place, dst_place)); + } +#endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) else if (platform::is_cuda_pinned_place(src_place) && // NOLINT platform::is_cuda_pinned_place(dst_place)) { @@ -304,6 +340,35 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, "Copy from %s to %s is not supported.", src_place, dst_place)); } #endif +#ifdef PADDLE_WITH_ASCEND_CL + else if (platform::is_npu_place(src_place) && // NOLINT + platform::is_cpu_place(dst_place)) { /* npu -> cpu*/ + memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr, + BOOST_GET_CONST(platform::NPUPlace, src_place), src_ptr, size, + nullptr); + } + else if (platform::is_cpu_place(src_place) && // NOLINT + platform::is_npu_place(dst_place)) { /* cpu -> npu*/ + memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), 
dst_ptr, + BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size, + nullptr); + } + else if (platform::is_npu_place(src_place) && // NOLINT + platform::is_npu_place(dst_place)) { /* npu -> npu*/ + if (src_ptr == dst_ptr) { + VLOG(3) << "Skip copy the same data sync from " << src_place << " to " + << dst_place; + return; + } + memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, + BOOST_GET_CONST(platform::NPUPlace, src_place), src_ptr, size, + nullptr); + } + else { // NOLINT + PADDLE_THROW(platform::errors::Unimplemented( + "Copy from %s to %s is not supported.", src_place, dst_place)); + } +#endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) else if (platform::is_cuda_pinned_place(src_place) && // NOLINT platform::is_cuda_pinned_place(dst_place)) { @@ -431,6 +496,13 @@ class AnyVisitor : public boost::static_visitor { return GetResultHelper(out, gpu); } + bool GetResult(const framework::Tensor& out, + const platform::NPUPlace& npu) const { + PADDLE_THROW( + platform::errors::Unimplemented("Not supported on place (%s) ", npu)); + // return GetResultHelper(out, npu); + } + bool GetResult(const framework::Tensor& out, const platform::CPUPlace& cpu) const { return *out.data(); @@ -633,6 +705,10 @@ struct BothFalseVisitor : public boost::static_visitor<> { #endif } + void VisitorImpl(const platform::NPUPlace& npu) const { + // TODO(zhiqiu) + } + void VisitorImpl(const platform::CPUPlace& cpu) const { int num = in_.numel(); const bool* in_ptr = in_.data(); @@ -746,6 +822,29 @@ void TensorToStream(std::ostream& os, const Tensor& tensor, #else PADDLE_THROW(platform::errors::Unimplemented( "XPUPlace is not supported when not compiled with XPU")); +#endif + } else if (platform::is_npu_place(tensor.place())) { +#ifdef PADDLE_WITH_ASCEND_CL + constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB + std::unique_ptr buf(new char[kBufSize]); + auto& npu_dev_ctx = + static_cast(dev_ctx); + platform::CPUPlace cpu; + uintptr_t data = reinterpret_cast(data_ptr); + while (size != 0) { + size_t size_to_write = std::min(kBufSize, static_cast(size)); + memory::Copy(cpu, buf.get(), + BOOST_GET_CONST(platform::NPUPlace, tensor.place()), + reinterpret_cast(data), size_to_write, + npu_dev_ctx.stream()); + npu_dev_ctx.Wait(); + os.write(buf.get(), size_to_write); + data += size_to_write; + size -= size_to_write; + } +#else + PADDLE_THROW(platform::errors::Unimplemented( + "NPUPlace is not supported when not compiled with NPU")); #endif } else { os.write(static_cast(data_ptr), @@ -801,9 +900,10 @@ void TensorFromStream(std::istream& is, Tensor* tensor, auto ctx = platform::CPUDeviceContext(); size_t size = tensor->numel() * framework::SizeOfType(desc.data_type()); if (platform::is_gpu_place(dev_ctx.GetPlace()) || - platform::is_xpu_place(dev_ctx.GetPlace())) { + platform::is_xpu_place(dev_ctx.GetPlace()) || + platform::is_npu_place(dev_ctx.GetPlace())) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_XPU) + defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_ASCEND_CL) Tensor cpu_tensor; cpu_tensor.Resize(framework::make_ddim(shape)); framework::VisitDataType( @@ -812,13 +912,19 @@ void TensorFromStream(std::istream& is, Tensor* tensor, is.read(static_cast(buf), size); auto dst_place = dev_ctx.GetPlace(); framework::TensorCopy(cpu_tensor, dst_place, dev_ctx, tensor); + if (platform::is_npu_place(dev_ctx.GetPlace())) { + dev_ctx.Wait(); + } #else if (platform::is_gpu_place(dev_ctx.GetPlace())) { PADDLE_THROW(platform::errors::Unimplemented( 
"CUDAPlace is not supported when not compiled with CUDA")); - } else { + } else if (platform::is_xpu_place(dev_ctx.GetPlace())) { PADDLE_THROW(platform::errors::Unimplemented( "XPUPlace is not supported when not compiled with XPU")); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "NPUPlace is not supported when not compiled with NPU")); } #endif } else { @@ -859,9 +965,10 @@ void TensorFromStream(std::istream& is, Tensor* tensor, auto ctx = platform::CPUDeviceContext(); size_t size = tensor->numel() * framework::SizeOfType(desc.data_type()); if (platform::is_gpu_place(dev_ctx.GetPlace()) || - platform::is_xpu_place(dev_ctx.GetPlace())) { + platform::is_xpu_place(dev_ctx.GetPlace()) || + platform::is_npu_place(dev_ctx.GetPlace())) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_XPU) + defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_ASCEND_CL) Tensor cpu_tensor; cpu_tensor.Resize(framework::make_ddim(dims)); framework::VisitDataType( @@ -870,13 +977,19 @@ void TensorFromStream(std::istream& is, Tensor* tensor, is.read(static_cast(buf), size); auto dst_place = dev_ctx.GetPlace(); framework::TensorCopy(cpu_tensor, dst_place, dev_ctx, tensor); + if (platform::is_npu_place(dev_ctx.GetPlace())) { + dev_ctx.Wait(); + } #else if (platform::is_gpu_place(dev_ctx.GetPlace())) { PADDLE_THROW(platform::errors::Unimplemented( "CUDAPlace is not supported when not compiled with CUDA")); - } else { + } else if (platform::is_xpu_place(dev_ctx.GetPlace())) { PADDLE_THROW(platform::errors::Unimplemented( "XPUPlace is not supported when not compiled with XPU")); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "NPUPlace is not supported when not compiled with NPU")); } #endif } else { diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h index 8a127e0ed5929525a96914c70f20411a61ed88e6..22c8e1c1665f121cda6ba33f23cb7fc0749da025 100644 --- a/paddle/fluid/framework/tensor_util.h +++ b/paddle/fluid/framework/tensor_util.h @@ -18,7 +18,6 @@ limitations under the License. */ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/dlpack_tensor.h" #include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/device_context.h" @@ -136,6 +135,7 @@ void TensorFromArray(const T* src, const size_t& array_size, } #endif } + template void TensorFromVector(const std::vector& src, const platform::DeviceContext& ctx, Tensor* dst) { @@ -158,6 +158,59 @@ void TensorFromVector(const std::vector& src, reinterpret_cast(ctx).stream()); } #endif +#ifdef PADDLE_WITH_ASCEND_CL + // NOTE(zhiqiu): Becareful that aclrtMemcpyAsync is different from + // cudaMemcpyAsync. + // cudaMemcpyAsync is actually "sync" between cpu <-> gpu. + // aclrtMemcpyAsync is really "async" between cpu <-> npu. + // Since vector is on cpu, I think this function should be a "sync" operation, + // so pass nullptr as stream to memory::Copy(). + else if (platform::is_npu_place(dst_place)) { // NOLINT + memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, + src_place, src_ptr, size, nullptr); + } +#endif +} + +// The fully specialized function should be inline to avoid +// multi-definition. +template <> +inline void TensorFromVector(const std::vector& src, + const platform::DeviceContext& ctx, Tensor* dst) { + // vector has no data() member, use array instead. 
+ // See details: + // https://stackoverflow.com/questions/46115669/why-does-stdvectorbool-have-no-data/46115714 + bool* array = new bool[src.size()]; + for (unsigned int i = 0; i < src.size(); i++) { + array[i] = static_cast(src[i]); + } + + auto dst_place = ctx.GetPlace(); + auto src_ptr = static_cast(array); + platform::CPUPlace src_place; + dst->Resize({static_cast(src.size())}); + auto dst_ptr = static_cast(dst->mutable_data(dst_place)); + auto size = src.size() * sizeof(bool); + + if (platform::is_cpu_place(dst_place)) { + memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr, + src_place, src_ptr, size); + } +#ifdef PADDLE_WITH_CUDA + else if (platform::is_gpu_place(dst_place)) { // NOLINT + memory::Copy( + BOOST_GET_CONST(platform::CUDAPlace, dst_place), dst_ptr, src_place, + src_ptr, size, + reinterpret_cast(ctx).stream()); + } +#endif +#ifdef PADDLE_WITH_ASCEND_CL + else if (platform::is_npu_place(dst_place)) { // NOLINT + memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, + src_place, src_ptr, size, nullptr); + } +#endif + delete[] array; } template @@ -172,6 +225,23 @@ void TensorFromVector(const std::vector& src, Tensor* dst) { memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } +template <> +inline void TensorFromVector(const std::vector& src, Tensor* dst) { + bool* array = new bool[src.size()]; + for (unsigned int i = 0; i < src.size(); i++) { + array[i] = static_cast(src[i]); + } + platform::CPUPlace dst_place = platform::CPUPlace(); + auto src_ptr = static_cast(array); + platform::CPUPlace src_place; + dst->Resize({static_cast(src.size())}); + auto dst_ptr = static_cast(dst->mutable_data(dst_place)); + auto size = src.size() * sizeof(bool); + + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); + delete[] array; +} + template void TensorToVector(const Tensor& src, const platform::DeviceContext& ctx, std::vector* dst) { @@ -195,6 +265,52 @@ void TensorToVector(const Tensor& src, const platform::DeviceContext& ctx, reinterpret_cast(ctx).stream()); } #endif +#ifdef PADDLE_WITH_ASCEND_CL + else if (platform::is_npu_place(src.place())) { // NOLINT + memory::Copy(dst_place, dst_ptr, + BOOST_GET_CONST(platform::NPUPlace, src.place()), src_ptr, + size, nullptr); + } +#endif +} + +template <> +inline void TensorToVector(const Tensor& src, + const platform::DeviceContext& ctx, + std::vector* dst) { + auto src_ptr = static_cast(src.data()); + auto size = src.numel() * sizeof(bool); + + bool* array = new bool[src.numel()]; + + platform::CPUPlace dst_place; + dst->resize(src.numel()); + auto dst_ptr = static_cast(array); + + if (platform::is_cpu_place(src.place())) { + memory::Copy(dst_place, dst_ptr, + BOOST_GET_CONST(platform::CPUPlace, src.place()), src_ptr, + size); + } +#ifdef PADDLE_WITH_CUDA + else if (platform::is_gpu_place(src.place())) { // NOLINT + memory::Copy( + dst_place, dst_ptr, BOOST_GET_CONST(platform::CUDAPlace, src.place()), + src_ptr, size, + reinterpret_cast(ctx).stream()); + } +#endif +#ifdef PADDLE_WITH_ASCEND_CL + else if (platform::is_npu_place(src.place())) { // NOLINT + memory::Copy(dst_place, dst_ptr, + BOOST_GET_CONST(platform::NPUPlace, src.place()), src_ptr, + size, nullptr); + } +#endif + for (unsigned int i = 0; i < src.numel(); i++) { + (*dst)[i] = static_cast(array[i]); + } + delete[] array; } template @@ -216,6 +332,32 @@ void TensorToVector(const Tensor& src, std::vector* dst) { BOOST_GET_CONST(platform::CPUPlace, src.place()), src_ptr, size); } +template <> +inline void TensorToVector(const 
Tensor& src, std::vector* dst) { + auto src_ptr = static_cast(src.data()); + auto size = src.numel() * sizeof(bool); + + bool* array = new bool[src.numel()]; + + platform::CPUPlace dst_place; + dst->resize(src.numel()); + auto dst_ptr = static_cast(array); + + PADDLE_ENFORCE_EQ( + platform::is_cpu_place(src.place()), true, + platform::errors::InvalidArgument( + "The input tensor should be CPU device, but actually it is in %s.", + src.place())); + + memory::Copy(dst_place, dst_ptr, + BOOST_GET_CONST(platform::CPUPlace, src.place()), src_ptr, size); + + for (unsigned int i = 0; i < src.numel(); i++) { + (*dst)[i] = static_cast(array[i]); + } + delete[] array; +} + std::ostream& operator<<(std::ostream& os, const Tensor& t); } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/tensor_util_test.cc b/paddle/fluid/framework/tensor_util_test.cc index c32efd0a470be201344fa8d7f817792315b7e6ef..8587ee8d1e91969be86ab50d18e70b6a0d034e98 100644 --- a/paddle/fluid/framework/tensor_util_test.cc +++ b/paddle/fluid/framework/tensor_util_test.cc @@ -242,6 +242,61 @@ TEST(TensorToVector, Tensor) { #endif } +TEST(TensorToVector, Tensor_bool) { + { + paddle::framework::Tensor src; + bool* src_ptr = + src.mutable_data({3, 3}, paddle::platform::CPUPlace()); + for (int i = 0; i < 3 * 3; ++i) { + src_ptr[i] = static_cast(i % 2); + } + + paddle::platform::CPUPlace place; + std::vector dst; + paddle::framework::TensorToVector(src, &dst); + + for (int i = 0; i < 3 * 3; ++i) { + EXPECT_EQ(src_ptr[i], dst[i]); + } + } +#ifdef PADDLE_WITH_CUDA + { + std::vector src_vec = { + false, true, false, true, false, true, false, true, false, + }; + paddle::framework::Tensor gpu_tensor; + paddle::platform::CUDAPlace place; + paddle::platform::CUDADeviceContext gpu_ctx(place); + paddle::framework::TensorFromVector(src_vec, gpu_ctx, &gpu_tensor); + + std::vector dst; + paddle::framework::TensorToVector(gpu_tensor, gpu_ctx, &dst); + + for (int i = 0; i < 3 * 3; ++i) { + EXPECT_EQ(src_vec[i], dst[i]); + } + } +#endif +#ifdef PADDLE_WITH_ASCEND_CL + { + std::vector src_vec = { + false, true, false, true, false, true, false, true, false, + }; + paddle::framework::Tensor npu_tensor; + paddle::platform::NPUPlace place(0); + paddle::platform::NPUDeviceContext npu_ctx(place); + paddle::framework::TensorFromVector(src_vec, npu_ctx, &npu_tensor); + + std::vector dst; + paddle::framework::TensorToVector(npu_tensor, npu_ctx, &dst); + + for (int i = 0; i < 3 * 3; ++i) { + EXPECT_EQ(src_vec[i], dst[i]); + } + } +#endif +} + TEST(TensorFromDLPack, Tensor) { { std::vector src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9}; diff --git a/paddle/fluid/framework/trainer.h b/paddle/fluid/framework/trainer.h index ca290a50b42fe0bfe37385c15bc54d8c5a305a06..636760029fedc4e3a570f9a63db5d1f84795ab62 100644 --- a/paddle/fluid/framework/trainer.h +++ b/paddle/fluid/framework/trainer.h @@ -27,8 +27,8 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/data_set.h" #include "paddle/fluid/framework/device_worker.h" #include "paddle/fluid/framework/fleet/heter_context.h" -#include "paddle/fluid/framework/fleet/heter_wrapper.h" -#include "paddle/fluid/framework/heter_service.h" +//#include "paddle/fluid/framework/fleet/heter_wrapper.h" +#include "paddle/fluid/framework/heter_util.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/reader.h" @@ -47,6 +47,10 @@ class PullDenseWorker; class Scope; class VarDesc; class DeviceWorker; +class HeterWrapper; +class HeterRequest; +class HeterResponse; + template class ChannelObject; @@ -109,13 +113,22 @@ class MultiTrainer : public TrainerBase { virtual Scope* GetWorkerScope(int thread_id); virtual std::string GetDumpPath(int tid); + template + void MergeToRootScope(LoDTensor* root_tensor, LoDTensor* thread_tensor); +#ifdef PADDLE_WITH_HETERPS + + void MergeDenseParam(); +#endif + protected: int thread_num_; std::vector threads_; std::vector readers_; std::vector> workers_; std::vector need_merge_var_names_; - +#ifdef PADDLE_WITH_HETERPS + std::vector places_; +#endif int mpi_rank_; int mpi_size_; int dump_file_num_; @@ -313,7 +326,6 @@ class PSGPUTrainer : public TrainerBase { float scale_datanorm_; paddle::platform::Place place_; ProgramDesc program_; - std::shared_ptr fleet_ptr_; std::shared_ptr pull_dense_worker_; std::vector> workers_; std::vector places_; @@ -324,7 +336,8 @@ class PSGPUTrainer : public TrainerBase { }; #endif -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(PADDLE_WITH_ASCEND_CL) class PipelineTrainer : public TrainerBase { public: PipelineTrainer() {} diff --git a/paddle/fluid/framework/trainer_desc.proto b/paddle/fluid/framework/trainer_desc.proto index 70481cf3727012e4cf41d235154eb277d92cc92f..504885ff5ccbce760c0a659aedabef6790de5f1a 100644 --- a/paddle/fluid/framework/trainer_desc.proto +++ b/paddle/fluid/framework/trainer_desc.proto @@ -93,6 +93,9 @@ message SectionWorkerParameter { optional int32 start_cpu_core_id = 4 [ default = 1 ]; repeated string param_need_sync = 5; optional int32 num_microbatches = 6; + optional int32 num_pipeline_stages = 7 [ default = 1 ]; + optional int32 pipeline_stage = 8 [ default = 1 ]; + optional int32 schedule_mode = 9 [ default = 0 ]; } message SectionConfig { diff --git a/paddle/fluid/framework/trainer_factory.cc b/paddle/fluid/framework/trainer_factory.cc index 6b9dbece8974c286a390627f25e4a25ee8bfb8d3..15073b6f78c5b35209c0c38135d067cb660e487e 100644 --- a/paddle/fluid/framework/trainer_factory.cc +++ b/paddle/fluid/framework/trainer_factory.cc @@ -76,7 +76,8 @@ REGISTER_TRAINER_CLASS(HeterBoxTrainer); (defined PADDLE_WITH_PSLIB) REGISTER_TRAINER_CLASS(PSGPUTrainer); #endif -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(PADDLE_WITH_ASCEND_CL) REGISTER_TRAINER_CLASS(PipelineTrainer); #endif } // namespace framework diff --git a/paddle/fluid/framework/type_defs.h b/paddle/fluid/framework/type_defs.h index a2b5a98401e2363bbfe98b375807ba91e7b5a2ae..e43cccfe648165ce962b779cb513effe990d0ab3 100644 --- a/paddle/fluid/framework/type_defs.h +++ b/paddle/fluid/framework/type_defs.h @@ -45,6 +45,17 @@ using Attribute = boost::variant< using AttributeMap = std::unordered_map; +#ifdef PADDLE_WITH_ASCEND_CL +using NPUAttribute = + boost::variant, + std::vector, 
std::vector, bool, + std::vector, BlockDesc*, int64_t, + std::vector, std::vector, + std::vector, std::vector>>; + +using NPUAttributeMap = std::unordered_map; +#endif + using OpCreator = std::function; diff --git a/paddle/fluid/framework/unused_var_check.cc b/paddle/fluid/framework/unused_var_check.cc index d2adbdd34512b3404550eb48aafd619ce015a028..0f8465ab8948e425ec48d10052643699e3c10ce7 100644 --- a/paddle/fluid/framework/unused_var_check.cc +++ b/paddle/fluid/framework/unused_var_check.cc @@ -53,27 +53,28 @@ static const std::unordered_set &GetOpWithUnusedVarAllowSet() { // Use pointer here for safe static deinitialization static auto *allow_set = new std::unordered_set({ // called once - "batch_norm", // 0 - "batch_norm_grad", // 0 - "sync_batch_norm", // 0 - "sync_batch_norm_grad", // 0 - "inplace_abn", // 0 - "inplace_abn_grad", // 0 - "dgc_momentum", // 0 - "fake_quantize_range_abs_max", // 0 - "rmsprop", // 0 - "sequence_conv_grad", // 0 - "roi_perspective_transform_grad", // 0 - "fill_zeros_like", // 1 - "fill_any_like", // 1 - "nce_grad", // 1 - "precision_recall", // 1 - "fusion_seqpool_cvm_concat", // 2 - "fused_batch_norm_act", // 2 - "fused_batch_norm_act_grad", // 2 - "data_norm", // 0 - "data_norm_grad", // 0 - "update_loss_scaling", // 0 + "batch_norm", // 0 + "batch_norm_grad", // 0 + "sync_batch_norm", // 0 + "sync_batch_norm_grad", // 0 + "inplace_abn", // 0 + "inplace_abn_grad", // 0 + "dgc_momentum", // 0 + "fake_quantize_range_abs_max", // 0 + "rmsprop", // 0 + "sequence_conv_grad", // 0 + "roi_perspective_transform_grad", // 0 + "fill_zeros_like", // 1 + "fill_any_like", // 1 + "nce_grad", // 1 + "precision_recall", // 1 + "fusion_seqpool_cvm_concat", // 2 + "fused_batch_norm_act", // 2 + "fused_batch_norm_act_grad", // 2 + "data_norm", // 0 + "data_norm_grad", // 0 + "update_loss_scaling", // 0 + "fused_embedding_eltwise_layernorm", // 0 }); return *allow_set; } diff --git a/paddle/fluid/framework/var_type.h b/paddle/fluid/framework/var_type.h index 8affeda67b3d07d67ceed2b657b285210e1bd076..2e35f9b845ac730b22841df70c5d20b1b6cedd45 100644 --- a/paddle/fluid/framework/var_type.h +++ b/paddle/fluid/framework/var_type.h @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_rank_table.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor_array.h" diff --git a/paddle/fluid/framework/var_type_traits.h b/paddle/fluid/framework/var_type_traits.h index b0d8f43a90f35f08c7bcc09858842424f53ddbe0..473df85aa0421ea74280029a11ad613f8537d6bd 100644 --- a/paddle/fluid/framework/var_type_traits.h +++ b/paddle/fluid/framework/var_type_traits.h @@ -21,7 +21,6 @@ #include #include "paddle/fluid/framework/feed_fetch_type.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/platform/place.h" #ifdef PADDLE_WITH_CUDA @@ -37,6 +36,11 @@ #endif #endif +#ifdef PADDLE_WITH_ASCEND_CL +#include +#include +#endif + #if defined(PADDLE_WITH_XPU_BKCL) #include "xpu/bkcl.h" #endif @@ -51,6 +55,10 @@ class Communicator; class NCCLCommunicator; #endif #endif +#ifdef PADDLE_WITH_ASCEND_CL +class Communicator; +class HCCLCommunicator; +#endif #if defined(PADDLE_WITH_XPU_BKCL) class BKCLCommunicator; @@ -163,6 +171,9 @@ using VarTypeRegistry = detail::VarTypeRegistryImpl< #endif operators::CudnnRNNCache, #endif +#if defined(PADDLE_WITH_ASCEND_CL) + HcclRootInfo, +#endif #if defined(PADDLE_WITH_XPU_BKCL) BKCLUniqueId, platform::BKCLCommunicator, #endif diff --git a/paddle/fluid/framework/variable_helper.h b/paddle/fluid/framework/variable_helper.h index 6e65bc2c932877e5365f4533631d50afae4465b3..4cdfba29249ccf257dab3bd2f1707cd7037c2e33 100644 --- a/paddle/fluid/framework/variable_helper.h +++ b/paddle/fluid/framework/variable_helper.h @@ -15,7 +15,6 @@ limitations under the License. */ #include -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/variable.h" namespace paddle { diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt index 5cfa2c2278662d6164464450e0993d3fe676382c..df4dcc6b5d91f3b6e18322116edafe7c133062bd 100644 --- a/paddle/fluid/imperative/CMakeLists.txt +++ b/paddle/fluid/imperative/CMakeLists.txt @@ -4,7 +4,7 @@ cc_library(prepared_operator SRCS prepared_operator.cc DEPS proto_desc operator cc_library(layer SRCS layer.cc DEPS prepared_operator math_function imperative_flag variable_helper op_registry) add_subdirectory(jit) cc_library(amp SRCS amp_auto_cast.cc DEPS layer ) -cc_library(tracer SRCS tracer.cc DEPS layer engine program_desc_tracer amp) +cc_library(tracer SRCS tracer.cc DEPS layer engine program_desc_tracer amp denormal) cc_library(basic_engine SRCS basic_engine.cc DEPS layer gradient_accumulator) cc_library(engine SRCS basic_engine.cc partial_grad_engine.cc DEPS layer gradient_accumulator) cc_library(imperative_profiler SRCS profiler.cc) diff --git a/paddle/fluid/imperative/amp_auto_cast.cc b/paddle/fluid/imperative/amp_auto_cast.cc index a56458b21398b31dd036cf840e4b837a8dc7eba4..b4154737e0fbc6245617fb0208f6623e4ebb5943 100644 --- a/paddle/fluid/imperative/amp_auto_cast.cc +++ b/paddle/fluid/imperative/amp_auto_cast.cc @@ -26,7 +26,24 @@ class VarBase; AmpOperators::AmpOperators() : allow_ops_(new std::unordered_set()), - block_ops_(new std::unordered_set()) {} + block_ops_(new std::unordered_set()), + unsupported_fp16_ops_(new std::unordered_set()) { + auto& all_kernels = framework::OperatorWithKernel::AllOpKernels(); + auto fp16_dtype = framework::proto::VarType::FP16; + for (auto it = all_kernels.begin(); it != all_kernels.end(); it++) { + bool supported = false; + for (auto& 
kernel_type : it->second) { + if (platform::is_gpu_place(kernel_type.first.place_) && + kernel_type.first.data_type_ == fp16_dtype) { + supported = true; + } + } + if (!supported) { + unsupported_fp16_ops_->insert(it->first); + } + } +} + AmpOperators::~AmpOperators() {} AmpOperators& AmpOperators::Instance() { @@ -44,16 +61,26 @@ AmpOperators::GetMutableBlockOps() { return block_ops_; } +std::shared_ptr> +AmpOperators::GetMutableUnsupportedFp16Ops() { + return unsupported_fp16_ops_; +} + std::ostream& operator<<(std::ostream& os, AmpOperators& ops) { os << "allow ops: "; auto allow_ops = ops.GetMutableAllowOps(); std::copy((*allow_ops).begin(), (*allow_ops).end(), std::ostream_iterator(os, " ")); - os << "; "; + os << "\n"; os << "block ops: "; auto block_ops = ops.GetMutableBlockOps(); std::copy((*block_ops).begin(), (*block_ops).end(), std::ostream_iterator(os, " ")); + os << "\n"; + os << "unsupported fp16 ops: "; + auto unsupported_fp16_ops = ops.GetMutableUnsupportedFp16Ops(); + std::copy((*unsupported_fp16_ops).begin(), (*unsupported_fp16_ops).end(), + std::ostream_iterator(os, " ")); return os; } @@ -133,7 +160,8 @@ NameVarBaseMap AutoCastInputs(const std::string& op_type, if (AmpOperators::Instance().GetMutableAllowOps()->count(op_type)) { for (auto& pair : new_ins) { // NOTE(zhiqiu): batch_norm and layer_norm support only input x is fp16. - if ((op_type == "batch_norm" || op_type == "layer_norm") && + if ((op_type == "batch_norm" || op_type == "layer_norm" || + op_type == "sync_batch_norm") && pair.first != "X") { continue; } @@ -156,9 +184,16 @@ NameVarBaseMap AutoCastInputs(const std::string& op_type, return new_ins; } else { auto dst_type = GetPromoteType(ins); + // NOTE(zhiqiu): if the op has no fp16 kernel, fall back to fp32. + if (dst_type == framework::proto::VarType::FP16 && + AmpOperators::Instance().GetMutableUnsupportedFp16Ops()->count( + op_type)) { + dst_type = framework::proto::VarType::FP32; + } for (auto& pair : new_ins) { // NOTE(zhiqiu): batch_norm and layer_norm support only input x is fp16. - if ((op_type == "batch_norm" || op_type == "layer_norm") && + if ((op_type == "batch_norm" || op_type == "layer_norm" || + op_type == "sync_batch_norm") && pair.first == "X" && dst_type == framework::proto::VarType::FP32) { continue; } diff --git a/paddle/fluid/imperative/amp_auto_cast.h b/paddle/fluid/imperative/amp_auto_cast.h index 619c6b0baf896f74eb6eae998801697ec4de6fb0..fa76c19688a693e036dd06bb54ad24ddf748a8af 100644 --- a/paddle/fluid/imperative/amp_auto_cast.h +++ b/paddle/fluid/imperative/amp_auto_cast.h @@ -40,6 +40,9 @@ class AmpOperators { std::shared_ptr> GetMutableBlockOps(); + std::shared_ptr> + GetMutableUnsupportedFp16Ops(); + private: AmpOperators(); // forbid calling default constructor @@ -50,6 +53,9 @@ class AmpOperators { // The set of ops that support fp16 calculation and are considered numerically // dangerous and whose effects may also be observed in downstream ops. std::shared_ptr> block_ops_; + + // The set of ops that have no fp16 CUDA kernel.
+ std::shared_ptr> unsupported_fp16_ops_; }; std::ostream& operator<<(std::ostream& os, AmpOperators& ops); diff --git a/paddle/fluid/imperative/basic_engine.cc b/paddle/fluid/imperative/basic_engine.cc index 29ba54986801f117074a415084b3e0f10675954b..7bcc3d6c608c947f71ae030cfb17d4a89495939e 100644 --- a/paddle/fluid/imperative/basic_engine.cc +++ b/paddle/fluid/imperative/basic_engine.cc @@ -36,48 +36,73 @@ DECLARE_bool(sort_sum_gradient); namespace paddle { namespace imperative { -void BasicEngine::Init(VarBase* var, bool retain_graph) { +void BasicEngine::Init( + const std::vector>& tensors, + const std::vector>& grad_tensors, + bool retain_graph) { retain_graph_ = retain_graph; - init_node_ = var->GradVarBase()->GradNode(); - PADDLE_ENFORCE_EQ(var->GradVarBase()->GraphIsFreed(), false, - platform::errors::Unavailable( - "%s trying to backward through the same graph a second " - "time, but this graph have already been freed. Please " - "specify Tensor.backward(retain_graph=True) when " - "calling backward at the first time.", - var->Name())); - - if (!retain_graph) { - VLOG(5) << "Clear the auto-grad graph from grad var " << var->Name() - << " because of retain_graph=False when calling backward"; - var->GradVarBase()->SetGraphIsFreed(true); - var->GradVarBase()->ClearGradNode(); - } - if (init_node_ == nullptr || var->OverridedStopGradient()) { - VLOG(3) << "Skip auto grad since there is no grad op for var or loss is " - "stop_gradient=True: " - << var->Name(); - return; - } + PADDLE_ENFORCE_EQ( + tensors.size(), grad_tensors.size(), + platform::errors::Unavailable( + "The size of tensors do not equal the size of grad_tensors," + "the size of tensors is %s, but the size of grad_tensors is %s.", + tensors.size(), grad_tensors.size())); + + for (size_t i = 0; i < tensors.size(); ++i) { + auto var = tensors[i]; + auto grad_tensor = grad_tensors[i]; + + auto init_node = var->GradVarBase()->GradNode(); + PADDLE_ENFORCE_EQ( + var->GradVarBase()->GraphIsFreed(), false, + platform::errors::Unavailable( + "%s trying to backward through the same graph a second " + "time, but this graph have already been freed. 
Please " + "specify Tensor.backward(retain_graph=True) when " + "calling backward at the first time.", + var->Name())); + + if (!retain_graph) { + VLOG(5) << "Clear the auto-grad graph from grad var " << var->Name() + << " because of retain_graph=False when calling backward"; + var->GradVarBase()->SetGraphIsFreed(true); + var->GradVarBase()->ClearGradNode(); + } - VLOG(3) << "Init first node of backward"; + if (init_node == nullptr || var->OverridedStopGradient()) { + VLOG(3) << "Skip auto grad since there is no grad op for var or loss is " + "stop_gradient=True: " + << var->Name(); + continue; + } - PADDLE_ENFORCE_EQ( - var->HasGradVar(), true, - platform::errors::NotFound("Grad variable not exist for variable %s", - var->Name())); - - auto& fwd_var = var->Var().Get(); - auto* grad_var = - var->GradVarBase()->MutableVar()->GetMutable(); - VLOG(6) << "init loss grad:" << var->GradVarBase()->Name() - << " as stop_gradient false"; - var->GradVarBase()->InnerSetOverridedStopGradient(false); - auto* dev_ctx = platform::DeviceContextPool::Instance().Get(fwd_var.place()); - grad_var->Resize(fwd_var.dims()); - grad_var->mutable_data(fwd_var.place(), fwd_var.type()); - operators::math::set_constant(*dev_ctx, grad_var, 1.0); + VLOG(3) << "Init node of backward"; + + PADDLE_ENFORCE_EQ( + var->HasGradVar(), true, + platform::errors::NotFound("Tensor %s has no gradient", var->Name())); + + auto& fwd_var = var->Var().Get(); + auto* grad_var = + var->GradVarBase()->MutableVar()->GetMutable(); + VLOG(6) << "init loss grad:" << var->GradVarBase()->Name() + << " as stop_gradient false"; + var->GradVarBase()->InnerSetOverridedStopGradient(false); + auto* dev_ctx = + platform::DeviceContextPool::Instance().Get(fwd_var.place()); + if (grad_tensor == nullptr) { + grad_var->Resize(fwd_var.dims()); + grad_var->mutable_data(fwd_var.place(), fwd_var.type()); + operators::math::set_constant(*dev_ctx, grad_var, 1.0); + } else { + paddle::framework::TensorCopy( + grad_tensor->Var().Get(), fwd_var.place(), + *dev_ctx, grad_var); + } + + init_nodes_.push_back(init_node); + } } void BasicEngine::CheckBackwardInputs(const OpBase& op) { @@ -141,17 +166,6 @@ void BasicEngine::PrepareGradAccumulators( << var.get() << ") that don't have grad node with reference count " << accumulator->RefCnt(); - - if (var->HasLeafHooks()) { - VLOG(3) << "Grad variable wrapper (" << var->Name() - << ") has leaf grad hooks."; - PADDLE_ENFORCE_NE( - var->HasGradNode(), true, - platform::errors::PermissionDenied( - "Only leaf Tensor's gradient can append hook to " - "Gradientaccumulator.")); - accumulator->SetPostHooks(var->GetLeafHooks()); - } } else { // Because Inplace op overwrites the grad_node of the input grad_var. 
So // only the information of grad_pending_node can be used to find the @@ -235,8 +249,10 @@ void BasicEngine::PrepareDeps() { std::queue q; std::unordered_set visited; - q.push(init_node_.get()); - visited.insert(init_node_.get()); + for (size_t i = 0; i < init_nodes_.size(); ++i) { + q.push(init_nodes_[i].get()); + visited.insert(init_nodes_[i].get()); + } while (!q.empty()) { auto* cur_node = q.front(); @@ -262,15 +278,41 @@ void BasicEngine::PrepareDeps() { } } +static std::shared_ptr> CallGradientHooks( + const NameVarMap& bwd_ins, const std::string& op_type) { + std::shared_ptr> tmp_ins_ptr = nullptr; + for (const auto& pair : bwd_ins) { + for (size_t i = 0; i < pair.second.size(); ++i) { + auto& var = pair.second[i]; + if (var->HasVariableWrapperHook()) { + if (tmp_ins_ptr == nullptr) { + tmp_ins_ptr = std::make_shared>(bwd_ins); + } + VLOG(3) << "Call " << var->GetVariableWrapperHooks().size() + << " hooks of " << op_type << "'s input `" << pair.first + << "`'s var `" << var->Name() << "`."; + auto tmp_var = var; + for (const auto& hook_pair : var->GetVariableWrapperHooks()) { + tmp_var = (*hook_pair.second)(tmp_var); + } + (*tmp_ins_ptr)[pair.first][i] = tmp_var; + } + } + } + return tmp_ins_ptr; +} + void BasicEngine::Execute() { - if (init_node_ == nullptr) { + if (init_nodes_.empty()) { return; } PrepareDeps(); // Start execute Computation graph std::queue> q; - q.push(std::move(init_node_)); + for (size_t i = 0; i < init_nodes_.size(); ++i) { + q.push(std::move(init_nodes_[i])); + } size_t op_num = 0; @@ -292,10 +334,15 @@ void BasicEngine::Execute() { auto& bwd_ins = cur_op.GetInsMap(); auto& bwd_outs = cur_op.GetOutsMap(); + /** + * [ Why need temporary outputs here? ] + * + * - construct the temp output map, avoid to disrupt graph + * - replace the element in the map by temp var, because a + * var may be coresponding to several grad var in one op + */ NameVarMap tmp_outs(bwd_outs); - // 1. construct the temp output map, avoid to disrupt graph - // 2. replace the element in the map by temp var, because a - // var may be coresponding to several grad var in one op + for (auto& pair : tmp_outs) { if (!pair.second.IsGrad()) { continue; @@ -361,7 +408,8 @@ void BasicEngine::Execute() { VLOG(10) << "create temporary var of " << var->Name() << " for sum gradient within this graph!"; } else if (!inplace_grad_name_map.empty() && - inplace_grad_name_map.count(pair.first)) { + inplace_grad_name_map.count(pair.first) && + bwd_ins.count(inplace_grad_name_map.at(pair.first))) { // When calculate Inplace grad op, create a new output var. // If a tmp var has been created, there is no need to create it // again. @@ -408,10 +456,36 @@ void BasicEngine::Execute() { } } + /** + * [ Why need temporary inputs here? ] + * + * - Hook execution should not change original input tensor. + * User can register hook for Tensor's gradient, It is expected + * that the hook only affects the gradient of the backward + * propagation, and does not affect the gradient value input + * as the hook. 
+ * - use `tmp_ins_ptr`, only copy bwd_ins when the var in bwd_ins + * hold hooks + */ + auto tmp_ins_ptr = CallGradientHooks(bwd_ins, cur_op.Type()); + { VLOG(3) << "Start to execute grad op " << cur_op.Type(); - OpBase::Run(cur_op.InnerOp(), bwd_ins, tmp_outs, cur_op.Attrs(), - cur_op.place()); + try { + if (tmp_ins_ptr == nullptr) { + OpBase::Run(cur_op.InnerOp(), bwd_ins, tmp_outs, cur_op.Attrs(), + cur_op.place()); + } else { + OpBase::Run(cur_op.InnerOp(), *tmp_ins_ptr, tmp_outs, + cur_op.Attrs(), cur_op.place()); + } + } catch (platform::EnforceNotMet& exception) { + Clear(); + throw std::move(exception); + } catch (std::exception& ex) { + Clear(); + PADDLE_THROW(platform::errors::External("%s", ex.what())); + } } for (auto& pair : inplace_output_grad_var_list_) { @@ -428,15 +502,14 @@ void BasicEngine::Execute() { if (!accumulator->SumGradCompleted()) { continue; } - // 1. Call Hooks for **inner_var_** + // 1. Call Hooks for `inner_var_` + accumulator->CallGradientHooks(); - // 2. Sum Gradient with Previous Graph + // 2. Sum Gradient `inner_var_` to `var_` of Current or Previous Graph accumulator->AccumulateGrad(); - // 3. Call backward Hooks for **var_** - if (accumulator->HasPostHooks()) { - accumulator->CallBackwardPostHooks(); - } + // 3. Call backward Hooks for `var_` + accumulator->CallReduceHooks(); } need_accu_var_list_.clear(); @@ -470,7 +543,7 @@ void BasicEngine::Execute() { } void BasicEngine::Clear() { - init_node_.reset(); + init_nodes_.clear(); node_deps_.clear(); accumulators_.clear(); accumulators_with_grad_node_.clear(); diff --git a/paddle/fluid/imperative/basic_engine.h b/paddle/fluid/imperative/basic_engine.h index a2ad8b5f8aa61e438e5afdc72681a96ee21c996b..49761a8df0b6b1d8494e72b6ea7b67c0fa15eb6b 100644 --- a/paddle/fluid/imperative/basic_engine.h +++ b/paddle/fluid/imperative/basic_engine.h @@ -30,7 +30,9 @@ class OpBase; class BasicEngine : public Engine { public: - void Init(VarBase* var, bool retain_graph = false); + void Init(const std::vector>& tensors, + const std::vector>& grad_tensors, + bool retain_graph = false); void Execute() override; @@ -46,7 +48,7 @@ class BasicEngine : public Engine { void Clear(); private: - std::shared_ptr init_node_; + std::vector> init_nodes_; std::unordered_map node_deps_; // The input and output of Inplace op are the same. 
If only `var` is used // as the key, then the input and output of inplace op must be gradient diff --git a/paddle/fluid/imperative/bkcl_context.cc b/paddle/fluid/imperative/bkcl_context.cc index 873068a0d310dc38dab867e73ff4577aae3a6f23..16f9454e9376e4368a478cf8adf9e3f988868785 100644 --- a/paddle/fluid/imperative/bkcl_context.cc +++ b/paddle/fluid/imperative/bkcl_context.cc @@ -19,12 +19,11 @@ #include #include +#include "paddle/fluid/framework/variable.h" #include "paddle/fluid/platform/bkcl_helper.h" #include "paddle/fluid/platform/collective_helper.h" -#include "paddle/fluid/platform/gen_comm_id_helper.h" - -#include "paddle/fluid/framework/variable.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/gen_comm_id_helper.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/string/split.h" #include "paddle/fluid/string/string_helper.h" @@ -77,7 +76,7 @@ void BKCLParallelContext::Init() { bkcl_ids.resize(strategy_.nrings_); if (strategy_.local_rank_ == 0) { - // generate the unique ncclid on the root worker + // generate the unique bkclid on the root worker for (size_t i = 0; i < bkcl_ids.size(); ++i) { auto ret = bkcl_get_unique_id(&bkcl_ids[i]); PADDLE_ENFORCE_EQ(BKCL_SUCCESS, ret, @@ -99,6 +98,28 @@ void BKCLParallelContext::Init() { } } +void BKCLParallelContext::InitWithRingID(int ring_id) { + std::vector bkcl_ids; + bkcl_ids.resize(1); + + if (strategy_.local_rank_ == 0) { + // generate the unique bkclid on the root worker + auto ret = bkcl_get_unique_id(&bkcl_ids[0]); + PADDLE_ENFORCE_EQ(BKCL_SUCCESS, ret, + platform::errors::PreconditionNotMet( + "BKCL get unique id failed [%d]", ret)); + } + BcastBKCLId(bkcl_ids, 0); + + int xpu_id = BOOST_GET_CONST(platform::XPUPlace, place_).device; + VLOG(0) << "init BKCL context nranks: " << strategy_.nranks_ + << " local rank: " << strategy_.local_rank_ << " xpu id: " << xpu_id + << " ring id: " << ring_id; + // it will assign bkcl_comm in XPUDeviceContext within ring_id + platform::BKCLCommContext::Instance().CreateBKCLComm( + &bkcl_ids[0], strategy_.nranks_, strategy_.local_rank_, xpu_id, ring_id); +} + void BKCLParallelContext::AllReduceByStream(const framework::Variable &src, framework::Variable *dst, int ring_id, bool use_calc_stream) { @@ -146,8 +167,6 @@ void BKCLParallelContext::WaitCompute(int ring_id) { platform::errors::OutOfRange("Ring id expected < nrings," "but got ring id = %d, nrings = %d", ring_id, strategy_.nrings_)); - // TODO(wangxi16): [Performance optimize] Maybe need to put Wait and - // bkcl_allreduce to comm thread, for bkcl_allreduce is blocking now. 
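// ---------------------------------------------------------------------------
// Illustrative sketch (not from this patch): how the widened
// BasicEngine::Init(tensors, grad_tensors, retain_graph) declared above might
// be driven now that the engine seeds backward from several roots at once.
// `loss_a`, `loss_b`, `grad_a`, `grad_b` are hypothetical VarBase handles and
// the vector element type is assumed from the declaration shown above.
// ---------------------------------------------------------------------------
#include <memory>
#include <vector>

#include "paddle/fluid/imperative/basic_engine.h"
#include "paddle/fluid/imperative/layer.h"

void RunBackwardSketch(
    const std::shared_ptr<paddle::imperative::VarBase>& loss_a,
    const std::shared_ptr<paddle::imperative::VarBase>& loss_b,
    const std::shared_ptr<paddle::imperative::VarBase>& grad_a,
    const std::shared_ptr<paddle::imperative::VarBase>& grad_b) {
  paddle::imperative::BasicEngine engine;
  // Every root and its initial gradient is handed over in one call, so
  // PrepareDeps()/Execute() can push all init nodes into the BFS queue.
  engine.Init({loss_a, loss_b}, {grad_a, grad_b}, /*retain_graph=*/false);
  engine.Execute();
}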
auto compute_dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get(place_)); compute_dev_ctx->Wait(); @@ -167,6 +186,12 @@ void BKCLParallelContext::WaitComm(int ring_id) { comm_dev_ctx->Wait(); } +void BKCLParallelContext::SynchronizeCompute() { + auto compute_dev_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(place_)); + compute_dev_ctx->Wait(); +} + } // namespace imperative } // namespace paddle #endif diff --git a/paddle/fluid/imperative/bkcl_context.h b/paddle/fluid/imperative/bkcl_context.h index d7d917f20082ac6e49bae604cdde04343b4dced4..652b7689666c6c66c4efe6edda0c23acfc0cab27 100644 --- a/paddle/fluid/imperative/bkcl_context.h +++ b/paddle/fluid/imperative/bkcl_context.h @@ -36,6 +36,8 @@ class BKCLParallelContext : public ParallelContext { void Init() override; + void InitWithRingID(int ring_id) override; + void AllReduceByStream(const framework::Variable& src, framework::Variable* dst, int ring_id, bool use_calc_stream) override; @@ -45,6 +47,8 @@ class BKCLParallelContext : public ParallelContext { void WaitCompute(int ring_id) override; void WaitComm(int ring_id) override; + + void SynchronizeCompute() override; }; } // namespace imperative diff --git a/paddle/fluid/imperative/data_loader.cc b/paddle/fluid/imperative/data_loader.cc index 71ea82e9a19e89965d5faf712fc5a9411c28db3e..c43149c9b563e73d7bdc8cbb39b8303083d2ac84 100644 --- a/paddle/fluid/imperative/data_loader.cc +++ b/paddle/fluid/imperative/data_loader.cc @@ -71,9 +71,12 @@ void EraseLoadProcessPIDs(int64_t key) { } \ } while (0) -#define REGISTER_SIGNAL_HANDLER(SIGNAL, HANDLER_NAME) \ - static void HANDLER_NAME(int sig, siginfo_t *info, void *ctx) { \ - SIGNAL_HANDLE(SIGNAL); \ +#define REGISTER_SIGNAL_HANDLER(SIGNAL, HANDLER_NAME, ERROR_MSG) \ + static void HANDLER_NAME(int sig, siginfo_t *info, void *ctx) { \ + auto _w = \ + write(STDERR_FILENO, ERROR_MSG, sizeof(ERROR_MSG) / sizeof(char)); \ + (void)_w; \ + SIGNAL_HANDLE(SIGNAL); \ } #define REGISTER_SPEC_SIGNAL_HANDLER(SIGNAL, HANDLER_NAME) \ @@ -84,8 +87,18 @@ void EraseLoadProcessPIDs(int64_t key) { SIGNAL_HANDLE(SIGNAL); \ } -REGISTER_SIGNAL_HANDLER(SIGSEGV, SIGSEGV_handler); -REGISTER_SIGNAL_HANDLER(SIGBUS, SIGBUS_handler); +REGISTER_SIGNAL_HANDLER(SIGSEGV, SIGSEGV_handler, + "ERROR: Unexpected segmentation fault encountered in " + "DataLoader workers.\n"); +REGISTER_SIGNAL_HANDLER( + SIGBUS, SIGBUS_handler, + "ERROR: Unexpected BUS error encountered in DataLoader worker. 
" + "This might be caused by insufficient shared memory (shm), " + "please check whether use_shared_memory is set and storage space " + "in /dev/shm is enough\n"); +REGISTER_SIGNAL_HANDLER(SIGFPE, SIGFPE_handler, + "ERROR: Unexpected floating-point exception " + "encountered in DataLoader worker.\n") REGISTER_SPEC_SIGNAL_HANDLER(SIGTERM, SIGTERM_handler); static inline void setSignalHandler(int signal, @@ -105,6 +118,7 @@ static inline void setSignalHandler(int signal, void SetLoadProcessSignalHandler() { setSignalHandler(SIGSEGV, &SIGSEGV_handler, nullptr); setSignalHandler(SIGBUS, &SIGBUS_handler, nullptr); + setSignalHandler(SIGFPE, &SIGFPE_handler, nullptr); setSignalHandler(SIGTERM, &SIGTERM_handler, nullptr); } diff --git a/paddle/fluid/imperative/dygraph_grad_maker.h b/paddle/fluid/imperative/dygraph_grad_maker.h index a367840472827505a7eb48310eb57bfc0c01a3ee..7fefc9ccc67b52aab5073d3dd6c738ab07075e78 100644 --- a/paddle/fluid/imperative/dygraph_grad_maker.h +++ b/paddle/fluid/imperative/dygraph_grad_maker.h @@ -279,6 +279,8 @@ class TracedGradOp { void SetType(const std::string& type) { op_->SetType(type); } + const framework::OperatorBase& InnerOp() const { return op_->InnerOp(); } + void SetAttrMap(const framework::AttributeMap& attrs) { return op_->SetAttrMap(attrs); } diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index deb504a1b657e4f348e47ea9a6e7b80029e109d4..43546cf99c69ffa3aa1f1a792e7b344ed0735a31 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -18,7 +18,6 @@ #include #include -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/imperative/layer.h" @@ -116,6 +115,23 @@ class TensorAddFunctor : public boost::static_visitor<> { } #endif +#ifdef PADDLE_WITH_ASCEND_CL + void operator()(const platform::NPUPlace& place) { + // TODO(zhiqiu): SUPPORT it + PADDLE_THROW(platform::errors::PermissionDenied( + "Gradient accumulation on place (%s) " + "is not supported in imperative mode", + place)); + } +#else + void operator()(const platform::NPUPlace& place) { + PADDLE_THROW(platform::errors::PermissionDenied( + "Gradient accumulation on place (%s) " + "is not supported in imperative mode", + place)); + } +#endif + // there is NO blas in CUDAPinnedPlace void operator()(const platform::CUDAPinnedPlace& place) { PADDLE_THROW(platform::errors::PermissionDenied( @@ -385,8 +401,8 @@ static platform::Place GetPlaceOfVar( void GradientAccumulator::AccumulateGrad() { /** - * If the gradient has been calculated by previous graph, - * it should be added to the previous graph result. + * If the leaf gradient has been calculated done, the inner_var_ + * should be added to the var_. 
*/ if (!var_->IsLeafGrad() || !SumGradCompleted() || !HasInnerVar()) { return; @@ -397,7 +413,7 @@ void GradientAccumulator::AccumulateGrad() { "this auto-grad")); PADDLE_ENFORCE_EQ(inner_var_->Var().IsInitialized(), true, platform::errors::InvalidArgument( - "Interior var of Leaf tensor should be initialized.")); + "Interior var of Leaf tensor should be initialized.")); auto* src = inner_var_->MutableVar(); auto* dst = var_->MutableVar(); if (!var_->IsEmpty()) { @@ -428,10 +444,65 @@ void GradientAccumulator::AccumulateGrad() { *(dst) = std::move(*src); var_->SetType(inner_var_->Type()); var_->SetDataType(inner_var_->DataType()); + var_->SetIsEmpty(false); } inner_var_.reset(); } +void GradientAccumulator::CallGradientHooks() { + PADDLE_ENFORCE_EQ(var_->IsLeafGrad(), true, + platform::errors::Unavailable( + "Only leaf gradient Tensor can deal with by gradient " + "hook in gradient accumulator.")); + PADDLE_ENFORCE_EQ( + SumGradCompleted(), true, + platform::errors::PreconditionNotMet( + "Only can call gradient hooks after sum gradient completed.")); + PADDLE_ENFORCE_EQ( + HasInnerVar(), true, + platform::errors::PreconditionNotMet( + "Leaf Tensor's inner var is nullptr when call gradient hook.")); + PADDLE_ENFORCE_EQ( + inner_var_->Var().IsInitialized(), true, + platform::errors::PreconditionNotMet("Leaf Tensor's inner var " + "is not initialized when " + "call gradient hook.")); + if (var_->HasVariableWrapperHook()) { + VLOG(3) << "Call " << var_->GetVariableWrapperHooks().size() + << " hooks of leaf gradient accumulator's inner var `" + << var_->Name() << "`."; + auto tmp_var = inner_var_; + VLOG(3) << "Input var " << var_->Name() << "'s hook size - " + << var_->GetVariableWrapperHooks().size(); + for (const auto& hook_pair : var_->GetVariableWrapperHooks()) { + tmp_var = (*hook_pair.second)(tmp_var); + } + inner_var_ = tmp_var; + } +} + +void GradientAccumulator::CallReduceHooks() { + PADDLE_ENFORCE_EQ( + var_->IsLeafGrad(), true, + platform::errors::Unavailable("Only leaf gradient Tensor can deal with " + "by reduce hook in gradient accumulator.")); + PADDLE_ENFORCE_EQ(SumGradCompleted(), true, + platform::errors::PreconditionNotMet( + "Only can call reduce hooks after the gradient " + "summation is completed in current batch.")); + PADDLE_ENFORCE_EQ(HasInnerVar(), false, + platform::errors::PreconditionNotMet( + "Only can call reduce hooks after the " + "gradient accumulation is completed in " + "current batch or across batchs.")); + if (var_->HasVoidHook()) { + for (const auto& hook : var_->GetVoidHooks()) { + VLOG(3) << "call gradient accumulator backward hooks."; + (*hook)(); + } + } +} + void EagerGradientAccumulator::SumGrad(std::shared_ptr var, size_t trace_id, bool unchange_input) { /** diff --git a/paddle/fluid/imperative/gradient_accumulator.h b/paddle/fluid/imperative/gradient_accumulator.h index e2dabc06a7dae6f326021dcdef7f3528661e787d..6411dce4405c11795418fb8334e26b32079e7596 100644 --- a/paddle/fluid/imperative/gradient_accumulator.h +++ b/paddle/fluid/imperative/gradient_accumulator.h @@ -40,8 +40,8 @@ class GradientAccumulator { } // inner_var_ record the grad of this auto-grad. - // Only need to generate inner var for non-empty leaf-tensor. - if (var->IsLeafGrad() && !var->IsEmpty()) { + // Only need to generate inner var for leaf-tensor. 
+ if (var->IsLeafGrad()) { inner_var_ = std::make_shared(var->Name()); inner_var_->SetType(var->Type()); inner_var_->SetDataType(var->DataType()); @@ -52,9 +52,6 @@ class GradientAccumulator { << ") to store result of this Graph"; } - // TODO(zhouwei): fix Tensor.clear_gradient() bug, remove this hard flag - var->SetIsEmpty(false); - // var_ is the final grad, processed by hooks and grad accumulation var_ = var; } @@ -93,42 +90,38 @@ class GradientAccumulator { inline bool HasInnerVar() const { return inner_var_ != nullptr; } - /* Hook related methods */ - inline bool HasPostHooks() const { return !post_hooks_.expired(); } - - void SetPostHooks(const std::shared_ptr& hooks) { - PADDLE_ENFORCE_NOT_NULL( - hooks, platform::errors::InvalidArgument( - "The hook set to GradientAccumulator is nullptr.")); - - auto shared_hooks = post_hooks_.lock(); - if (shared_hooks != hooks) { - PADDLE_ENFORCE_EQ( - shared_hooks, nullptr, - platform::errors::PermissionDenied( - "Cannot set post hooks twice to GradientAccumulator.")); - post_hooks_ = hooks; - } - } - // void CallHooks(){} - // ** inner_var_ ** - // function that Sum Gradient with Previous Graph void AccumulateGrad(); - // call backward post hooks, such as reduce hook - void CallBackwardPostHooks() { - PADDLE_ENFORCE_NE( - post_hooks_.expired(), true, - platform::errors::NotFound( - "The post hooks of GradientAccumulator for Tensor `%s` expired.", - var_->Name())); - auto shared_hooks = post_hooks_.lock(); - for (const auto& hook : shared_hooks->backward_hooks()) { - VLOG(3) << "call gradient accumulator backward hooks."; - (*hook)(var_); - } - } + /** [ Hook related methods ] + * + * [Why need two types of VariableWrapperHook? ] + * + * There are two types of gradient accumulation: + * 1. Gradient accumulation in same batch + * 2. Gradient accumulation across batchs + * The order of execution between Hooks and gradient accumulation: + + * [ Gradient accumulation in same batch] + * | + * [ leaf GradVarBase hooks ] + * | + * [ Gradient accumulation across batchs ] + * | + * [ Gradient reduce / allreduce hooks ] + + * Because we currently intend to accumulate these two gradient + * accumulation in one GradientAccumulator, We must distinguish between + * two types of hooks. + + * And the InplaceVariableWrapperHook does not allow users to register + * directly, and is currently only used to support the reduce strategy of + * parallel multi-card training. + */ + + void CallGradientHooks(); + + void CallReduceHooks(); protected: VariableWrapper* var_; @@ -137,7 +130,6 @@ class GradientAccumulator { std::shared_ptr inner_var_; size_t ref_cnt_{0}; size_t cur_cnt_{0}; - std::weak_ptr post_hooks_; }; class EagerGradientAccumulator : public GradientAccumulator { diff --git a/paddle/fluid/imperative/hooks.h b/paddle/fluid/imperative/hooks.h index 1211ec6ae6c7bdd28a08687ed65e98b5741e2865..fa929b7c7a51c77eaf307ab2900f58fc452e6969 100644 --- a/paddle/fluid/imperative/hooks.h +++ b/paddle/fluid/imperative/hooks.h @@ -18,215 +18,63 @@ #include #include #include - -#include "paddle/fluid/imperative/type_defs.h" -#include "paddle/fluid/platform/macros.h" - namespace paddle { namespace imperative { class VariableWrapper; -/** [ Basic hook classes ] - * s - * @brief OpBasePreHook is executed before the grad OpBase is executed, - * taking the input of the current grad OpBase as input, and - * executing python hooks (user-defined) or C++ hooks (developer-defined) - * to achieve the purpose of custom operations on the interior VarBase - * gradient. 
+/** [ VariableWrapper Hook ] * - * @note OpBasePreHook will not change the input gradient VarBase. + * @brief This hook functor is executed before the grad OpBase is executed or + * after gradient accumulation completed in current batch. + * 1. For interior var, VariableWrapper Hook take the input of the + * current grad OpBase as input. + * 2. For leaf var, VariableWrapper Hook take the inner_var_ of + * GradientAccumulator as input. * - * @note [Why need to be OpBase `PreHook`, why not `PostHook`?] + * @note This hook functor will not change the input gradient VariableWrapper, + * but if you copy the input VariableWrapper and change the value of + * Variable in VariableWrapper, the value of input will also be changed, + * because they shared same PlaceHolder. * - * If set OpBase post hook, when the op executed end, the op's output - * gradient may not be the final state, because it may need other op's - * gradient output to accumulated to it. But before op can be executed, - * the gradient output must have been accumulated to final value. + * @note [ Why need to be OpBase `PreHook`, why not `PostHook`? ] * - * @note [Why only can be used for interior VarBase?] + * We expect If set OpBase post hook, when the op executed end, the + * op's output gradient may not be the final state, because it may need + * other op's gradient output to accumulated to it. But before op can + * be executed, the gradient output must have been accumulated to final + * value. + * + * @note [ Why Leaf gradient is special? ] * * Because the leaf VarBase's GradVarBase has no GradOpNode, so leaf * GradVarBase has no next OpBase to executed, so if need to deal with - * the leaf GradVarBase, cannot use OpBasePreHook. For this case, we - * deal with by GradAccumulatorPostHook. + * the leaf GradVarBase, we should call hooks after gradient accumulation + * completed. */ -class OpBasePreHook { +class VariableWrapperHook { public: - virtual ~OpBasePreHook() = default; - virtual VariableWrapperList operator()( - const VariableWrapperList& grad_inputs) = 0; + virtual ~VariableWrapperHook() = default; + virtual std::shared_ptr operator()( + const std::shared_ptr& var) = 0; }; -/** - * @brief GradAccumulatorPostHook is the Hook that operates on the current - * gradientafter the GradientAccumulator has accumulated the gradient. - * Leaf GradVarBase has no next OpBase, if we want to register hook - * for it, we also need to wait until the leaf GradVarBase accumulation - * is completed, so we can add post hook to GradientAccumulator. - * - * @note GradAccumulatorPostHook will change the grad VarBase value. - * - * @note Only allow leaf VarBase hold GradientAccumulatorPostHook. - */ -class GradAccumulatorPostHook { - public: - virtual ~GradAccumulatorPostHook() = default; - virtual void operator()(VariableWrapper* var) = 0; -}; - -/** [ Hook for cpp functions ] - * - * Here we design three C++ hooks; - * 1. CppOpBasePreHook (Implement later): - * - used for developer-defined C++ interior VarBase hooks - * 2. CppGradAccumulatorPostHook (Implement later): - * - used for developer-defined C++ leaf VarBase hooks - * 3. LambdaGradAccumulatorPostHook: - * - used for VarBase reduce in parallel training - * - * @note [Why need two types of GradAccumulatorPostHook? ] - * - * There are two types of gradient accumulation: - * 1. Gradient accumulation in same batch - * 2. 
Gradient accumulation across batchs - * The order of execution between Hooks and gradient accumulation: - * - * [ Gradient accumulation in same batch] - * | - * [ leaf GradVarBase hooks ] - * | - * [ Gradient accumulation across batchs ] - * | - * [ Gradient reduce / allreduce] - * - * Because we currently intend to accumulate these two gradient - * accumulation in one GradientAccumulator, We must distinguish between - * two types of hooks. - * - * And the LambdaGradAccumulatorPostHook does not allow users to register - * directly, and is currently only used to support the reduce strategy of - * parallel multi-card training. - */ -class LambdaGradAccumulatorPostHook : public GradAccumulatorPostHook { +class CppVariableWrapperHook : public VariableWrapperHook { public: - explicit LambdaGradAccumulatorPostHook( - std::function fn) + explicit CppVariableWrapperHook( + std::function( + const std::shared_ptr&)>&& fn) : fn_(std::move(fn)) {} - void operator()(VariableWrapper* var) override { fn_(var); } - - private: - std::function fn_; -}; - -/* Hooks for python function: in pybind/imperative.cc */ - -/** Add Python Hooks later: - * - PyOpBasePreHook (Implement later): used for user-defined interior python - * VarBase hooks - * - PyGradAccumulatorPostHook (Implement later): used for user-defined leaf - * python VarBase hooks - */ - -/** [ Hook Pipeline classes ] - * - * @note [Why need hook pipeline classes?] - * - * There are 2 purposes for adding Hook pipeline here: - * - * 1. Make the code implementation cleaner. - * - * If there are no Hook pipeline, we need to add 3 hook vector into - * VariableWrapper, 1 hook vector into OpBase, 2 hook vector into - * GradientAccumulator, like: - * - * - VariableWrapper: - * std::vector> - * interior_var_hooks_; - * std::vector> - * leaf_var_hooks_; - * std::vector> - * backward_hooks_; - * - * - OpBase: - * std::vector> - * interior_var_hooks_; - * - * - GradientAccumulator: - * std::vector> - * leaf_var_hooks_; - * std::vector> - * backward_hooks_; - * - * This seems more complicated, and std::vector> - * is not easy to destruct. - * - * 2. Make the code easier to understand. - * - * From these two packages, we can clearly understand that we - * have two types of Hooks, respectively for the interior - * gradient var and leaf gradient var inside the backward - * calculation graph. - */ - -class InteriorVarHookPipeline { - public: - InteriorVarHookPipeline() = default; - - void add_hook(std::unique_ptr&& hook) { - hooks_.emplace_back(std::move(hook)); + std::shared_ptr operator()( + const std::shared_ptr& var) override { + return fn_(var); } - const std::vector>& hooks() const { - return hooks_; - } - - std::vector>& hooks() { return hooks_; } - private: - std::vector> hooks_; - - DISABLE_COPY_AND_ASSIGN(InteriorVarHookPipeline); -}; - -class LeafVarHookPipeline { - public: - LeafVarHookPipeline() = default; - - void add_hook(std::unique_ptr&& hook) { - hooks_.emplace_back(std::move(hook)); - } - - const std::vector>& hooks() const { - return hooks_; - } - - std::vector>& hooks() { - return hooks_; - } - - void add_backward_hook(std::unique_ptr&& hook) { - backward_hooks_.emplace_back(std::move(hook)); - } - - const std::vector>& backward_hooks() - const { - return backward_hooks_; - } - - std::vector>& backward_hooks() { - return backward_hooks_; - } - - private: - std::vector> hooks_; - // NOTE: the `backward` here means the `whole backward process`, - // the `backward_hooks_` need to be executed after the `whole backward - // process`. 
- std::vector> backward_hooks_; - - DISABLE_COPY_AND_ASSIGN(LeafVarHookPipeline); + std::function( + const std::shared_ptr&)> + fn_; }; } // namespace imperative diff --git a/paddle/fluid/imperative/jit/program_desc_tracer.cc b/paddle/fluid/imperative/jit/program_desc_tracer.cc index 53750f7bf02be206d8f6524bf1b1894c570978d1..1a44f50275ef8f524f4291468cb25b5a4bd59e85 100644 --- a/paddle/fluid/imperative/jit/program_desc_tracer.cc +++ b/paddle/fluid/imperative/jit/program_desc_tracer.cc @@ -69,6 +69,7 @@ UniqueBlockVarGenerator::UniqueBlockVarGenerator( std::string UniqueBlockVarGenerator::NameOf(const std::weak_ptr &var, const std::string &prefix) { + VLOG(3) << "Finding: " << var.lock()->Name(); auto all_vars_iter = all_vars_.find(var); PADDLE_ENFORCE_EQ(all_vars_iter != all_vars_.end(), true, platform::errors::NotFound( @@ -111,6 +112,15 @@ void UniqueBlockVarGenerator::InsertNewVarInBlock( } } +bool ProgramDescTracer::ContainVar(const std::weak_ptr &var) const { + auto vars_iter = vars_.find(var); + bool ret = (vars_iter != vars_.end()); + if (!ret) { + VLOG(5) << "Can't found variable: " << var.lock()->Name(); + } + return ret; +} + void ProgramDescTracer::InsertOp(const std::string &type, const NameVarBaseMap &inputs, const NameVarBaseMap &outputs, @@ -147,12 +157,16 @@ TracedProgramTuple ProgramDescTracer::CreateProgramDesc( std::vector feed_var_names; for (auto &feed_var : feed_vars) { - feed_var_names.emplace_back(generator.NameOf(feed_var, feed_prefix)); + if (ContainVar(feed_var)) { + feed_var_names.emplace_back(generator.NameOf(feed_var, feed_prefix)); + } } std::vector fetch_var_names; for (auto &fetch_var : fetch_vars) { - fetch_var_names.emplace_back(generator.NameOf(fetch_var, fetch_prefix)); + if (ContainVar(fetch_var)) { + fetch_var_names.emplace_back(generator.NameOf(fetch_var, fetch_prefix)); + } } for (auto &op : ops_) { @@ -164,7 +178,9 @@ TracedProgramTuple ProgramDescTracer::CreateProgramDesc( std::vector names; names.reserve(pair.second.size()); for (auto &var : pair.second) { - names.emplace_back(generator.NameOf(var, tmp_prefix)); + if (ContainVar(var)) { + names.emplace_back(generator.NameOf(var, tmp_prefix)); + } } op_desc->SetInput(pair.first, std::move(names)); @@ -174,7 +190,9 @@ TracedProgramTuple ProgramDescTracer::CreateProgramDesc( std::vector names; names.reserve(pair.second.size()); for (auto &var : pair.second) { - names.emplace_back(generator.NameOf(var, tmp_prefix)); + if (ContainVar(var)) { + names.emplace_back(generator.NameOf(var, tmp_prefix)); + } } op_desc->SetOutput(pair.first, std::move(names)); diff --git a/paddle/fluid/imperative/jit/program_desc_tracer.h b/paddle/fluid/imperative/jit/program_desc_tracer.h index 8e2e59a49ed7be473a1f89aaefde6bf123a9dea9..b231efb0e53a515fcd2e6c58c62e49f7bdccf1db 100644 --- a/paddle/fluid/imperative/jit/program_desc_tracer.h +++ b/paddle/fluid/imperative/jit/program_desc_tracer.h @@ -66,7 +66,7 @@ class ProgramDescTracer { const std::string &feed_prefix, const std::vector> &fetch_vars, const std::string &fetch_prefix, const std::string &tmp_prefix) const; - + bool ContainVar(const std::weak_ptr &var) const; void Reset(); private: diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index 062f04c6b7052f8ce9032df5809f5ada86e4b777..a4af3117d3e32ea8db37881bef9c4423ba0173ca 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -187,6 +187,7 @@ size_t VarBase::GradOpNum() const { } void VarBase::ClearGradient() { + VLOG(4) << "ClearGradient " << Name(); if 
(grad_var_) { if (grad_var_->Var().IsType()) { auto* grad_t = @@ -406,7 +407,7 @@ void OpBase::Run(const framework::OperatorBase& op, OpBaseRunImpl(op, ins, outs, attrs, place); } -static void ClearNoNeedBufferInputs(OpBase* op) { +void ClearNoNeedBufferInputs(OpBase* op) { auto& inferer = op->Info().NoNeedBufferVarsInferer(); if (!inferer) return; auto* ins = op->GetMutableInsMap(); diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index ff5a780a5f9dbf4ae7799d5b249838362310fc08..bbede47e36429887b70c7a7310176c38f6d41a52 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -30,6 +30,7 @@ #include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/imperative/flags.h" +#include "paddle/fluid/imperative/hooks.h" #include "paddle/fluid/imperative/saved_variable_wrapper_list.h" #include "paddle/fluid/imperative/type_defs.h" #include "paddle/fluid/imperative/variable_wrapper.h" @@ -107,6 +108,10 @@ class VarBase { void ClearGradVarBase() { grad_var_ = nullptr; } + void SetGradVarBase(VarBase& grad_var) { + MutableGradVarBase()->CopyFrom(grad_var, true); + } + const std::shared_ptr& MutableGradVarBase() { if (grad_var_ == nullptr) { if (auto grad_var_wrapper = var_->GetGradVar()) { @@ -220,6 +225,28 @@ class VarBase { void BumpInplaceVersion(); + /* Hook related method: now only used for GradVarBase */ + bool HasVariableWrapperHook() const { return var_->HasVariableWrapperHook(); } + + int64_t AddVariableWrapperHook(std::shared_ptr&& hook) { + return var_->AddVariableWrapperHook( + std::forward>(hook)); + } + + bool RemoveVariableWrapperHook(const int64_t& hook_id) { + return var_->RemoveVariableWrapperHook(hook_id); + } + + const std::map>& + GetVariableWrapperHooks() const { + return var_->GetVariableWrapperHooks(); + } + + void AddVoidHook(std::shared_ptr>&& hook) { + var_->AddVoidHook( + std::forward>>(hook)); + } + private: /** * NOTE(zengjinle): never remove the const qualifier of `var_` if you are @@ -259,5 +286,7 @@ std::shared_ptr CreateGradOpNode( const platform::Place& place, const std::map& inplace_map); +void ClearNoNeedBufferInputs(OpBase* op); + } // namespace imperative } // namespace paddle diff --git a/paddle/fluid/imperative/nccl_context.cc b/paddle/fluid/imperative/nccl_context.cc index eb0135d15e0743ef003b846a8a60a24385be7eea..9f036742f0f5dd4113a92a67980484eca2da3965 100644 --- a/paddle/fluid/imperative/nccl_context.cc +++ b/paddle/fluid/imperative/nccl_context.cc @@ -35,7 +35,7 @@ namespace imperative { void NCCLParallelContext::BcastNCCLId( std::vector &nccl_ids, // NOLINT - int root) { + int root, int server_fd) { if (strategy_.local_rank_ == root) { std::vector other_trainers; for (auto &ep : strategy_.trainer_endpoints_) { @@ -45,11 +45,14 @@ void NCCLParallelContext::BcastNCCLId( } platform::SendBroadCastCommID(other_trainers, &nccl_ids); } else { - platform::RecvBroadCastCommID(strategy_.current_endpoint_, &nccl_ids); + platform::RecvBroadCastCommID(server_fd, strategy_.current_endpoint_, + &nccl_ids); } } void NCCLParallelContext::Init() { + int server_fd = -1; + std::vector nccl_ids; nccl_ids.resize(strategy_.nrings_); @@ -58,8 +61,13 @@ void NCCLParallelContext::Init() { for (size_t i = 0; i < nccl_ids.size(); ++i) { platform::dynload::ncclGetUniqueId(&nccl_ids[i]); } + } else { + // FIXME(wangxi): gloo will use rank0 endpoint, so not create socket server + // on rank0. 
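// ---------------------------------------------------------------------------
// Illustrative sketch (not from this patch): registering the two hook kinds
// exposed by the VarBase methods above. `grad_var_base` stands for some leaf
// parameter's GradVarBase and is hypothetical; error handling is omitted.
// ---------------------------------------------------------------------------
#include <cstdint>
#include <functional>
#include <memory>

#include "paddle/fluid/imperative/hooks.h"
#include "paddle/fluid/imperative/layer.h"

void RegisterHooksSketch(
    const std::shared_ptr<paddle::imperative::VarBase>& grad_var_base) {
  // A VariableWrapperHook runs before the grad op for interior vars, or after
  // same-batch accumulation for leaf vars. Returning the input unchanged makes
  // it a no-op; a real hook would usually copy `var` and transform the copy.
  auto grad_hook =
      std::make_shared<paddle::imperative::CppVariableWrapperHook>(
          [](const std::shared_ptr<paddle::imperative::VariableWrapper>& var)
              -> std::shared_ptr<paddle::imperative::VariableWrapper> {
            return var;
          });
  int64_t hook_id =
      grad_var_base->AddVariableWrapperHook(std::move(grad_hook));

  // A void hook fires only after all gradient accumulation has finished; the
  // Reducer below registers exactly this kind of hook to trigger allreduce.
  grad_var_base->AddVoidHook(std::make_shared<std::function<void()>>(
      []() { /* e.g. mark the parameter ready for reduce */ }));

  // The wrapper hook can be dropped again via the returned id.
  grad_var_base->RemoveVariableWrapperHook(hook_id);
}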
+ server_fd = platform::SocketServer::GetInstance(strategy_.current_endpoint_) + .socket(); } - BcastNCCLId(nccl_ids, 0); + BcastNCCLId(nccl_ids, 0, server_fd); int gpu_id = BOOST_GET_CONST(platform::CUDAPlace, place_).device; for (int ring_id = 0; ring_id < strategy_.nrings_; ring_id++) { @@ -79,6 +87,36 @@ void NCCLParallelContext::Init() { } } +void NCCLParallelContext::InitWithRingID(int ring_id) { + int server_fd = -1; + std::vector nccl_ids; + nccl_ids.resize(1); + + if (strategy_.local_rank_ == 0) { + // generate the unique ncclid on the root worker + platform::dynload::ncclGetUniqueId(&nccl_ids[0]); + } else { + // FIXME(wangxi): gloo will use rank0 endpoint, so not create socket server + // on rank0. + server_fd = platform::SocketServer::GetInstance(strategy_.current_endpoint_) + .socket(); + } + BcastNCCLId(nccl_ids, 0, server_fd); + + int gpu_id = BOOST_GET_CONST(platform::CUDAPlace, place_).device; + VLOG(0) << "init nccl context nranks: " << strategy_.nranks_ + << " local rank: " << strategy_.local_rank_ << " gpu id: " << gpu_id + << " ring id: " << ring_id; + // it will assign nccl_comm in CUDADeviceContext within ring_id + platform::NCCLCommContext::Instance().CreateNCCLComm( + &nccl_ids[0], strategy_.nranks_, strategy_.local_rank_, gpu_id, ring_id); + + compute_events_.emplace_back(platform::CudaEventResourcePool::Instance().New( + BOOST_GET_CONST(platform::CUDAPlace, place_).device)); + comm_events_.emplace_back(platform::CudaEventResourcePool::Instance().New( + BOOST_GET_CONST(platform::CUDAPlace, place_).device)); +} + void NCCLParallelContext::AllReduceByStream(const framework::Variable &src, framework::Variable *dst, int ring_id, bool use_calc_stream) { @@ -149,6 +187,12 @@ void NCCLParallelContext::WaitComm(int ring_id) { #endif } +void NCCLParallelContext::SynchronizeCompute() { + auto *compute_dev_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(place_)); + compute_dev_ctx->Wait(); +} + #endif } // namespace imperative diff --git a/paddle/fluid/imperative/nccl_context.h b/paddle/fluid/imperative/nccl_context.h index 51e5743aebdc3dd333f40f8ec59d1bb35f620843..1eee393aa714bb21ab77ee6668d719b93787981f 100644 --- a/paddle/fluid/imperative/nccl_context.h +++ b/paddle/fluid/imperative/nccl_context.h @@ -49,10 +49,13 @@ class NCCLParallelContext : public ParallelContext { ~NCCLParallelContext() override = default; - void BcastNCCLId(std::vector& nccl_ids, int root); // NOLINT + void BcastNCCLId(std::vector& nccl_ids, int root, // NOLINT + int server_fd); void Init() override; + void InitWithRingID(int ring_id) override; + void AllReduceByStream(const framework::Variable& src, framework::Variable* dst, int ring_id, bool use_calc_stream) override; @@ -63,6 +66,8 @@ class NCCLParallelContext : public ParallelContext { void WaitComm(int ring_id) override; + void SynchronizeCompute() override; + private: // used for comm wait compute, compute_stream-->event-->comm_stream[ring_id] std::vector> compute_events_; diff --git a/paddle/fluid/imperative/op_base.h b/paddle/fluid/imperative/op_base.h index 2b7642ae7cfd92fd323dfdfeced08102bf942d42..0164ff9313cdfe2344f98610602a6bd40a5e903a 100644 --- a/paddle/fluid/imperative/op_base.h +++ b/paddle/fluid/imperative/op_base.h @@ -177,8 +177,6 @@ class OpBase { std::unique_ptr op_; platform::Place place_; size_t id_{-1UL}; - - std::weak_ptr pre_hooks_; }; class GradOpNode { diff --git a/paddle/fluid/imperative/parallel_context.h b/paddle/fluid/imperative/parallel_context.h index 
ef0a960409215181ec7b7bc38e5ee55b540502b9..f537a316014d60ed18250d72de3ec2b7dd95cf05 100644 --- a/paddle/fluid/imperative/parallel_context.h +++ b/paddle/fluid/imperative/parallel_context.h @@ -50,6 +50,8 @@ class ParallelContext { virtual void Init() = 0; + virtual void InitWithRingID(int ring_id) = 0; + virtual void AllReduceByStream(const framework::Variable& src, framework::Variable* dst, int ring_id, bool use_calc_stream) = 0; @@ -64,6 +66,9 @@ class ParallelContext { // if CPU, should do nothing. virtual void WaitComm(int ring_id) = 0; + // synchorize compute stream + virtual void SynchronizeCompute() = 0; + inline int GetNRings() const { return strategy_.nrings_; } inline int64_t GetNRanks() const { return strategy_.nranks_; } diff --git a/paddle/fluid/imperative/partial_grad_engine.cc b/paddle/fluid/imperative/partial_grad_engine.cc index 8dd8cafc835ab17b05258fcdd7f9393584329da4..3da3a05ed1071cae20cf16ebfed6f6310937daae 100644 --- a/paddle/fluid/imperative/partial_grad_engine.cc +++ b/paddle/fluid/imperative/partial_grad_engine.cc @@ -369,6 +369,10 @@ class GradientAccumulationInfo { *is_finished = (cur_ref_cnt_ == total_ref_cnt_); accumulator_->SumGrad(grad_var_partial, trace_id, unchange_input); + if (*is_finished && accumulator_->HasInnerVar()) { + accumulator_->AccumulateGrad(); + } + if (create_graph_) { VLOG(10) << "Store partial grad grad for double grad " << mapped_grad_var_->Name(); diff --git a/paddle/fluid/imperative/py_layer_fwd.h b/paddle/fluid/imperative/py_layer_fwd.h new file mode 100644 index 0000000000000000000000000000000000000000..de5f9d75e9173a7d39c113b881e078dc43c83f39 --- /dev/null +++ b/paddle/fluid/imperative/py_layer_fwd.h @@ -0,0 +1,177 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include "paddle/fluid/imperative/layer.h" +#include "paddle/fluid/imperative/tracer.h" + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/type_defs.h" +#include "paddle/fluid/operators/py_layer_op.h" + +namespace paddle { +namespace imperative { + +namespace py = ::pybind11; + +bool RequiredGrad(const NameVarBaseMap& ins, const NameVarBaseMap& outs) { + for (const auto& name_pair : ins) { + for (const auto& var_base : name_pair.second) { + if (!var_base->OverridedStopGradient()) { + PassStopGradient(outs, var_base->OverridedStopGradient()); + return true; + } + } + } + return false; +} + +std::shared_ptr CreateGradOpNode( + const std::string& type, const NameVarBaseMap& ins, + const NameVarBaseMap& outs, const framework::AttributeMap& attrs, + const platform::Place& place, + const std::map& inplace_map, + const std::shared_ptr& py_context) { + operators::PyLayerGradOpMaker maker( + type, ins, outs, attrs, inplace_map); + + maker.SetPyLayerContext(py_context); + auto grad_node = maker(); + if (grad_node && !grad_node->empty()) { + for (auto& grad_op : *grad_node) { + grad_op.SetId(OpBase::GenerateUniqueId()); + grad_op.SetPlace(place); + ClearNoNeedBufferInputs(&grad_op); + } + return grad_node; + } else { + return nullptr; + } +} + +py::object PyLayerApply(const platform::Place& place, const py::handle& cls, + const py::args args, const py::kwargs kwargs) { + py::gil_scoped_acquire guard; + auto bk_function = cls.attr("_backward_function"); + auto context = bk_function(); + auto forward = cls.attr("forward"); + + auto result_forward = forward(context, *args, **kwargs); + std::shared_ptr py_layer_ctx = + std::make_shared(context.ptr()); + // make inputs to varbase + std::vector> input_vars; + // process args,`input_vars` only collect `imperative::VarBase` + if (!args.empty()) { + for (auto ptr = args.begin(); ptr != args.end(); ptr++) { + try { + if (Py_None != ptr->ptr()) { + auto a = ptr->cast>(); + input_vars.push_back(a); + } + } catch (py::cast_error& err) { + // Only collect Tensor type in 'args' and pass them to backward. Ignore + // other types of input temporarily. + } + } + } + // process kwargs, only collect `imperative::VarBase` + if (!kwargs.empty()) { + for (auto ptr = kwargs.begin(); ptr != kwargs.end(); ptr++) { + try { + if (Py_None != ptr->second.ptr()) { + auto a = ptr->second.cast>(); + input_vars.push_back(a); + } + } catch (py::cast_error&) { + // Only collect Tensor type in 'kwargs' and pass them to backward. + // Ignore other types of input temporarily. + } + } + } + NameVarBaseMap ins = {{"X", input_vars}}; + + std::vector> output_vars; + if (PyTuple_Check(result_forward.ptr()) || + PyList_Check(result_forward.ptr())) { + auto tuple_result = result_forward.cast(); + for (size_t i = 0; i < tuple_result.size(); i++) { + if (Py_None != tuple_result[i].ptr()) { + try { + auto temp_out = + tuple_result[i].cast>(); + output_vars.push_back(temp_out); + } catch (py::cast_error&) { + // Only collect Tensor type in 'kwargs' and pass them to backward. + // Ignore other types of input temporarily. + } + } else { + // Only collect Tensor type in 'kwargs' and pass them to backward. + // Ignore other types of input temporarily. + } + } + } else { + if (Py_None != result_forward.ptr()) { + try { + auto temp_out = + result_forward.cast>(); + output_vars.push_back(temp_out); + } catch (py::cast_error&) { + // Only collect Tensor type in 'kwargs' and pass them to backward. 
+ // Ignore other types of input temporarily. + } + } else { + // Only collect Tensor type in 'kwargs' and pass them to backward. + // Ignore other types of input temporarily. + } + } + if (output_vars.size() == 0) { + PADDLE_THROW(platform::errors::InvalidArgument( + "At least one output of `PyLayer.forward` is a `Tensor`.")); + } + + NameVarBaseMap outs = {{"Out", output_vars}}; + + if (RequiredGrad(ins, outs)) { + std::map inplace_map{}; + bool if_inplace = false; + for (auto temp_ins : input_vars) { + if (if_inplace) { + break; + } + for (auto temp_outs : output_vars) { + if (temp_ins->Name() == temp_outs->Name()) { + if_inplace = true; + break; + } + } + } + if (if_inplace) { + inplace_map["X"] = "Out"; + } + + CreateGradOpNode("py_layer", ins, outs, {{}}, place, inplace_map, + py_layer_ctx); + } else { + VLOG(3) << "No Grad to track for Op: py_layer_op"; + } + + return result_forward; +} + +} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc index e8b531d35cabfce8873f0f5b4142d95582188de4..0f6676ed48f349c7aa8d66459f7c74355bf53a9b 100644 --- a/paddle/fluid/imperative/reducer.cc +++ b/paddle/fluid/imperative/reducer.cc @@ -297,7 +297,7 @@ Reducer::Reducer(const std::vector> &vars, is_sparse_gradient_(is_sparse_gradient), parallel_ctx_(parallel_ctx), group_size_limits_(group_size_limits), - find_unused_vars_(find_unused_vars) { + find_unused_vars_each_step_(find_unused_vars) { VLOG(3) << "Start construct the Reducer ..."; nrings_ = parallel_ctx->GetNRings(); nranks_ = parallel_ctx->GetNRanks(); @@ -310,13 +310,16 @@ Reducer::Reducer(const std::vector> &vars, for (size_t global_var_index = 0; global_var_index < vars_.size(); ++global_var_index) { auto var = vars_[global_var_index]; - var->SharedVar()->AddGradVarLeafBackwardHook( - std::unique_ptr( - new LambdaGradAccumulatorPostHook([=](VariableWrapper *grad) { - this->AddDistHook(global_var_index); - }))); + var->GradVarBase()->AddVoidHook(std::make_shared>( + [=]() { this->AddDistHook(global_var_index); })); var_index_map_[var->GradVarBase()->SharedVar().get()] = global_var_index; } + + // for checking var is ready once + vars_marked_ready_.resize(vars_.size(), false); + + // Initialize local used vars + local_used_vars_.resize(vars_.size(), 0); } void Reducer::InitializeDenseGroups( @@ -325,7 +328,7 @@ void Reducer::InitializeDenseGroups( for (size_t index = 0; index < variable_indices_.size(); ++index) { const auto variable_index = variable_indices_[index]; const auto &var = vars_[variable_index]; - const auto var_name = var->Name(); + const auto &var_name = var->Name(); PADDLE_ENFORCE_EQ(is_sparse_gradient_[variable_index], false, platform::errors::PreconditionNotMet( "Tensor %s's GRAD must be LoDTensor, but received " @@ -336,7 +339,7 @@ void Reducer::InitializeDenseGroups( PADDLE_ENFORCE_EQ(lod_tensor->IsInitialized(), true, platform::errors::PreconditionNotMet( "Tensor %s is not initialized.", var_name)); - auto size = lod_tensor->numel(); + const auto size = lod_tensor->numel(); PADDLE_ENFORCE_GT( size, 0, platform::errors::PreconditionNotMet( "The number of tensor %s's elements is 0.", var_name)); @@ -348,8 +351,8 @@ void Reducer::InitializeDenseGroups( p_group->dense_tensors_.push_back(framework::Tensor()); // check the dtype and place, it must be same. 
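// ---------------------------------------------------------------------------
// Illustrative sketch (not from this patch): the finalization order that the
// Reducer's per-parameter void hook above relies on, written out for one
// hypothetical leaf GradientAccumulator. It mirrors the sequence used in
// basic_engine.cc.
// ---------------------------------------------------------------------------
#include "paddle/fluid/imperative/gradient_accumulator.h"

void FinalizeLeafGradSketch(
    paddle::imperative::GradientAccumulator* accumulator) {
  // 1. VariableWrapperHooks see only the freshly summed inner_var_
  //    (the gradient of the current batch).
  accumulator->CallGradientHooks();
  // 2. inner_var_ is folded into var_, which may still hold gradients
  //    accumulated across previous batches.
  accumulator->AccumulateGrad();
  // 3. Void hooks such as the Reducer's AddDistHook callback run last, once
  //    var_ carries the complete value that allreduce should see.
  accumulator->CallReduceHooks();
}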
- auto dtype = var->DataType(); - auto place = var->Place(); + const auto &dtype = var->DataType(); + const auto &place = var->Place(); if (index > 0) { PADDLE_ENFORCE_EQ( dtype, p_group->dtype_, @@ -419,8 +422,7 @@ void Reducer::InitializeGroups( group.variable_indices_ = std::move(variable_indices_); groups_.emplace_back(std::move(group)); // Debug Message For Reducer - VLOG(3) << "The Group[" << group_index << "]:"; - VLOG(3) << groups_.back(); + VLOG(3) << "The Group[" << group_index << "]:" << groups_.back(); } } @@ -441,10 +443,6 @@ void Reducer::PrepareDeps(const std::unordered_set &init_nodes) { auto *cur_node = q.front(); q.pop(); - for (auto &cur_op : *cur_node) { - cur_op.EnforceHasInOut(); - } - const auto &grad_pending_nodes = cur_node->GradPendingNodes(); for (auto &grad_pending_node : grad_pending_nodes) { PADDLE_ENFORCE_NOT_NULL( @@ -459,38 +457,8 @@ void Reducer::PrepareDeps(const std::unordered_set &init_nodes) { } } -// After each batch is calculated, the counter of each group(group.pending_) -// and allreudce sequence counter(next_group_) will be cleaned up again. -void Reducer::PrepareForBackward( +void Reducer::TraverseBackwardGraph( const std::vector> &outputs) { - VLOG(3) << "start reseting count.."; - next_group_ = 0; - std::for_each(groups_.begin(), groups_.end(), [](Group &group) { - group.pending_ = group.variable_indices_.size(); - group.sparse_contents_ = nullptr; - }); - - PADDLE_ENFORCE_EQ( - all_group_ready_, false, - platform::errors::PreconditionNotMet( - "Please note that all forward outputs derived from the module " - "parameters must participate in the calculation of losses and " - "subsequent gradient calculations. If not, the wrapper will hang, " - "waiting for autograd to generate gradients for these parameters. " - "you can use detach or stop_gradient to make the unused parameters " - "detached from the autograd graph.")); - - // The first var to trigger the unused parameter - has_marked_unused_vars_ = false; - if (!find_unused_vars_) { - return; - } - - // TODO(shenliang03) "find_unused_vars" interface will be exposed in the - // future to handle control flow to process unused parameters - find_unused_vars_ = false; - - unused_vars_.clear(); node_deps_.clear(); std::queue> q; std::unordered_set var_visited; @@ -517,7 +485,6 @@ void Reducer::PrepareForBackward( q.pop(); for (const auto &cur_op : *cur_node) { - cur_op.EnforceHasInOut(); auto &bwd_outs = cur_op.GetOutsMap(); for (const auto &pair : bwd_outs) { if (!pair.second.IsGrad()) { @@ -555,6 +522,67 @@ void Reducer::PrepareForBackward( } } +// After each batch is calculated, the counter of each group(group.pending_) +// and allreudce sequence counter(next_group_) will be cleaned up again. +void Reducer::PrepareForBackward( + const std::vector> &outputs) { + VLOG(3) << "after forward, then reset count for backward."; + next_group_ = 0; + std::for_each(groups_.begin(), groups_.end(), [](Group &group) { + group.pending_ = group.variable_indices_.size(); + group.sparse_contents_ = nullptr; + }); + + // reinitialize vars_marked_ready_ for next iteration + vars_marked_ready_.clear(); + vars_marked_ready_.resize(vars_.size(), false); + + PADDLE_ENFORCE_EQ( + groups_need_finalize_, false, + platform::errors::PreconditionNotMet( + "A serious error has occurred here. Please " + "set find_unused_parameters=True to traverse backward graph " + "in each step to prepare reduce in advance. 
If you have " + "set, There may be several reasons for this error: " + "1) Please note that all forward outputs derived from the module " + "parameters must participate in the calculation of losses and " + "subsequent gradient calculations. If not, the wrapper will hang, " + "waiting for autograd to generate gradients for these parameters. " + "you can use detach or stop_gradient to make the unused parameters " + "detached from the autograd graph. " + "2) Used multiple forwards and one backward. You may be able to wrap " + "multiple forwards in a model.")); + + // The first var to trigger the unused parameter + has_marked_unused_vars_ = false; + + if (find_unused_vars_once_ || find_unused_vars_each_step_) { + unused_vars_.clear(); + TraverseBackwardGraph(outputs); + // only check once in first step + find_unused_vars_once_ = false; + } + + if (find_unused_vars_each_step_ && unused_vars_.empty()) { + LOG_FIRST_N(WARNING, 1) + << "All parameters are involved in the backward pass. " + "It is recommended to set find_unused_parameters to False " + "to improve performance. However, if unused parameters " + "appear in subsequent iterative training, then an error " + "will occur. Please make it clear that in the subsequent " + "training, there will be no parameters that are not used " + "in the backward pass, and then set find_unused_parameters"; + } + + if (unused_vars_.size() == vars_.size()) { + LOG_FIRST_N(WARNING, 1) + << "There is no parameter in the device involved " + "in the backward calculation. If there are " + "parameters on other devices involved in the " + "backward, then a serious error will occur here."; + } +} + // Add hook function to each leaf node. When the gradient of a leaf node is // generated, if it is the sparse parameter, it will directly execute allreduce, // if it is the dense parameter, it will execute three steps: 1, @@ -565,67 +593,141 @@ void Reducer::PrepareForBackward( // concat + allreduce + split is emitted in turn according to next_group_. // 3, FinalizeBackward: after the end, synchronize each stream. void Reducer::AddDistHook(size_t var_index) { + PADDLE_ENFORCE_LT(var_index, variable_locators_.size(), + platform::errors::OutOfRange( + "Out of bounds variable index. it must be less" + "than %d, but it is %d", + variable_locators_.size(), var_index)); + VLOG(3) << "Var[" << var_index << "] [" << vars_[var_index]->GradVarBase()->Name() << "] arrived and triggered disthook"; + + local_used_vars_[var_index] = 1; + + // rebuild group when find_unused_vars_each_step_ is false + if (NeedRebuildGroup()) { + rebuild_vars_.push_back(vars_[var_index]); + rebuild_var_indices_.push_back(var_index); + } + if (!has_marked_unused_vars_) { has_marked_unused_vars_ = true; - for (auto unused_index : unused_vars_) { - if (NeedRebuildGroup()) { - rebuild_vars_.push_back(vars_[unused_index]); - rebuild_var_indices_.push_back(unused_index); - } + for (const auto &unused_index : unused_vars_) { MarkVarReady(unused_index, false); } } - if (NeedRebuildGroup()) { - rebuild_vars_.push_back(vars_[var_index]); - rebuild_var_indices_.push_back(var_index); - } MarkVarReady(var_index, true); } void Reducer::MarkVarReady(const size_t var_index, const bool is_used_var) { - all_group_ready_ = true; + groups_need_finalize_ = true; + const auto &var_locator = variable_locators_[var_index]; - auto group_index = var_locator.group_index; + const auto group_index = var_locator.group_index; auto &group = groups_[group_index]; + // error happened, if the var is ready before. 
+ if (vars_marked_ready_[var_index]) { + auto error_info = string::Sprintf( + "Error happened, when parameter[%d][%s] has been ready before. " + "Please set find_unused_parameters=True to traverse backward graph " + "in each step to prepare reduce in advance. If you have set, " + "there may be several reasons for this error: " + "1) In multiple reentrant backward phase, some parameters are reused." + "2) Using model parameters outside of forward function. Please " + "make sure that model parameters are not shared in concurrent " + "forward-backward passes.", + var_index, vars_[var_index]->GradVarBase()->Name()); + + PADDLE_ENFORCE_EQ(has_marked_unused_vars_, false, + platform::errors::PreconditionNotMet(error_info)); + + error_info += + "3) Unused parameters retrieval is incorrect. " + "The return value of forward will be used to retrieve" + " the unused parameters of the entire model. These " + "gradients of unused parameters will not be synchronized " + "between multiple cards. However, if the unused " + "parameters participate in the backward calculation " + "again at a later time (e.g. after the forward function, " + "the loss calculation uses the unused " + "paramters of the forward and trigger backward), " + "its gradient will be wrong."; + + PADDLE_ENFORCE_EQ(has_marked_unused_vars_, true, + platform::errors::PreconditionNotMet(error_info)); + } else { + vars_marked_ready_[var_index] = true; + } + if (!group.is_sparse_) { // process dense group - auto inside_group_index = var_locator.inside_group_index; - auto length = group.length_[inside_group_index]; + const auto inside_group_index = var_locator.inside_group_index; + const auto length = group.length_[inside_group_index]; auto &group_tensor = group.dense_tensors_[inside_group_index]; + if (is_used_var) { - auto var_warpper = vars_[var_index]->GradVarBase()->SharedVar(); - auto tensor = - var_warpper->MutableVar()->GetMutable(); + auto var_base = vars_[var_index]->GradVarBase(); + auto tensor = var_base->MutableVar()->GetMutable(); group_tensor.ShareDataWith(*tensor).Resize( {static_cast(length)}); } else { + // TODO(shenliang03): maybe save the memory + // by avoiding tensor construction if (!group_tensor.IsInitialized()) { group_tensor.Resize({static_cast(length)}); group_tensor.mutable_data(place_, group.dtype_); + } + #ifdef PADDLE_WITH_XPU_BKCL - if (platform::is_xpu_place(group_tensor.place())) { - // TODO(liuyuhui) support XPU set constant - VLOG(3) << "XPU doesn't support set_constant"; - } + if (platform::is_xpu_place(group_tensor.place())) { + // TODO(liuyuhui) support XPU set constant + VLOG(3) << "XPU doesn't support set_constant"; + } #else - auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place_); + auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place_); + if (HasGrad(var_index)) { + auto var_base = vars_[var_index]->GradVarBase(); + auto tensor = + var_base->MutableVar()->GetMutable(); + TensorCopy(*tensor, place_, *dev_ctx, &group_tensor); + group_tensor.Resize({static_cast(length)}); + } else { + group_tensor.Resize({static_cast(length)}); operators::math::set_constant(*dev_ctx, &group_tensor, 0.0); -#endif } +#endif } } else { // process sparse group - if (is_used_var) { - auto var_warpper = vars_[var_index]->GradVarBase()->SharedVar(); - group.sparse_contents_ = var_warpper->MutableVar(); - } else { - group.sparse_contents_ = nullptr; - } + PADDLE_ENFORCE_EQ( + HasGrad(var_index), true, + platform::errors::PreconditionNotMet( + "The sparse parameter[%d][%s] should have gradient. 
" + "Currently, DataParallel does not support sparse " + "parameters without generating gradients during training. " + "For example, if is_sparese=True is used in Embedding, " + "the current step of this parameter cannot generate gradient " + "because of stop_gradient/detatch, where error will occur.", + var_index, vars_[var_index]->Name())); + auto var_base = vars_[var_index]->GradVarBase(); + // need to check tensor type + PADDLE_ENFORCE_EQ( + var_base->Var().IsType(), true, + platform::errors::PreconditionNotMet( + "The sparse parameter[%d][%s] must have a selectedrows gradient. " + "Before forward pass, the parameter type is inferred to be " + "SelectedRows, but after backward pass, its actual type becomes " + "LodTensor. It is currently not supported by DataParallel. " + "For example, if sparse embedding is used, and the weight of " + "embedding is shared with subsequent dense parameters, then " + "the parameter gradient of the embedding will be converted " + "to dense parameters.", + var_index, vars_[var_index]->Name())); + + group.sparse_contents_ = var_base->MutableVar(); } if (--group.pending_ == 0) { @@ -641,6 +743,14 @@ void Reducer::MarkVarReady(const size_t var_index, const bool is_used_var) { // TODO(liuyuhui): If BKCL support non-blocking communication, it should be // fixed as same as multi gpus card trainging. void Reducer::MarkGroupReady(size_t group_index) { + PADDLE_ENFORCE_GE( + group_index, next_group_, + platform::errors::PreconditionNotMet( + "The index of the incoming group must be greater " + "than or equal to the previously synchronized group index, " + "expect it to greater than or equal to %d, but got %d.", + next_group_, group_index)); + if (group_index > next_group_) { VLOG(3) << "It will adjust the order of group in next batch automatically"; return; @@ -649,7 +759,7 @@ void Reducer::MarkGroupReady(size_t group_index) { for (; next_group_ < groups_.size() && groups_[next_group_].pending_ == 0; ++next_group_) { auto &group = groups_[next_group_]; - int run_order = next_group_ % nrings_; + const int run_order = next_group_ % nrings_; // For CUDA or XPU, compute_stream --> comm_stream. // For CPU, do nothing. @@ -665,10 +775,11 @@ void Reducer::MarkGroupReady(size_t group_index) { // TODO(liuyuhui): Add try catch to deal with exception later, // otherwise the main thread will continue to run when an exception is // thrown in comm_pool_. 
- comm_pool_->enqueue([&] { + auto next_group = next_group_; + comm_pool_->enqueue([this, run_order, next_group, &group] { auto dev_id = BOOST_GET_CONST(platform::XPUPlace, place_).device; platform::SetXPUDeviceId(dev_id); - FusedAllReduceSchedule(run_order, group); + FusedAllReduceSchedule(run_order, group, next_group); { std::lock_guard lock(mutex_); comm_op_count_ -= 1; // lock @@ -676,7 +787,7 @@ void Reducer::MarkGroupReady(size_t group_index) { } }); #elif defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL) - FusedAllReduceSchedule(run_order, group); + FusedAllReduceSchedule(run_order, group, next_group_); #else PADDLE_THROW(platform::errors::PreconditionNotMet( "Not compiled with BKCL or NCCL.")); @@ -684,24 +795,23 @@ void Reducer::MarkGroupReady(size_t group_index) { } } -void Reducer::FusedAllReduceSchedule(int run_order, Group &group) { +void Reducer::FusedAllReduceSchedule(const int run_order, Group &group, + const int curr_group_index) { + // The overall timeline: concat > div_nranks > allreduce > split + // dev_context is used to select different stream + const auto &dev_context = *parallel_ctx_->GetDeviceContext(run_order); if (group.is_sparse_) { - if (group.sparse_contents_ != nullptr) { - VLOG(3) << "sparse group [" << next_group_ << "] start allreduce in ring[" - << run_order << "]"; - group.DivNRanks(*parallel_ctx_->GetDeviceContext(run_order), nranks_); - parallel_ctx_->AllReduceByStream( - *group.sparse_contents_, group.sparse_contents_, run_order, false); - } else { - VLOG(3) << "The sparse group[" << next_group_ - << "] has no var to allreduce"; - } + VLOG(3) << "sparse group [" << curr_group_index + << "] start allreduce in ring[" << run_order << "]"; + group.DivNRanks(dev_context, nranks_); + parallel_ctx_->AllReduceByStream(*group.sparse_contents_, + group.sparse_contents_, run_order, false); } else { - VLOG(3) << "dense group [" << next_group_ << "] start allreduce in ring[" - << run_order << "]"; + VLOG(3) << "dense group [" << curr_group_index + << "] start allreduce in ring[" << run_order << "]"; // Select common commstream to concat tensors // group.dense_tensors ---> group.dense_contents_ - group.ConcatTensors(*parallel_ctx_->GetDeviceContext(run_order)); + group.ConcatTensors(dev_context); // NOTE(liuyuhui): ConcatTensors use communication stream, but BKCL only support // default stream for communicating, so there exist some problems in @@ -713,15 +823,15 @@ void Reducer::FusedAllReduceSchedule(int run_order, Group &group) { parallel_ctx_->WaitComm(run_order); } #endif - group.DivNRanks(*parallel_ctx_->GetDeviceContext(run_order), nranks_); + group.DivNRanks(dev_context, nranks_); // Start allreduce parallel_ctx_->AllReduceByStream( group.dense_contents_, &(group.dense_contents_), run_order, false); - // Select common commstream to split tensors + // Select communication stream to split tensors // group.dense_contents_ ---> group.dense_tensors - group.SplitTensors(*parallel_ctx_->GetDeviceContext(run_order)); + group.SplitTensors(dev_context); } } @@ -747,14 +857,98 @@ std::vector> Reducer::RebuildGruops() { return rebuild_group_indices; } +void Reducer::ProcessUnusedDenseVars() { + // The calculation stream must be used here to + // avoid conflicts with communication. 
+ VLOG(3) << "Local used vars : " + << string::join_strings(local_used_vars_, ','); + const auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place_); + // H2D is to allreduce the local_used_vars_ + auto *global_used_tensor = + global_used_vars_.GetMutable(); + framework::TensorFromVector(local_used_vars_, *dev_ctx, + global_used_tensor); + parallel_ctx_->AllReduceByStream(global_used_vars_, &global_used_vars_, 0, + true); + framework::TensorToVector(*global_used_tensor, *dev_ctx, + &local_used_vars_); + + // sync compute stream to get global used var message, + // but maybe affect speed performance + parallel_ctx_->SynchronizeCompute(); + VLOG(3) << "Global used vars : " + << string::join_strings(local_used_vars_, ','); + + for (const auto var_index : unused_vars_) { + const bool global_unused = (local_used_vars_[var_index] == 0); + + // global used but local unused, set grad + VLOG(3) << "Var [" << var_index << "] [" << vars_[var_index]->Name() + << "] global_unused:" << global_unused + << " has grad: " << HasGrad(var_index); + + if (!global_unused) { + VLOG(3) << "Start process unused Var"; + // 1. source var base + const auto &var_locator = variable_locators_[var_index]; + const auto group_index = var_locator.group_index; + const auto &group = groups_[group_index]; + const auto inside_group_index = var_locator.inside_group_index; + const auto &src_tensor = group.dense_tensors_[inside_group_index]; + // sparse no need to check and no support find_unused_parameters + if (group.is_sparse_) { + continue; + } + // 2. destination var base + auto dest_var_base = vars_[var_index]; + auto *dest_tensor = + dest_var_base->MutableVar()->GetMutable(); + const auto &dest_dims = dest_tensor->dims(); + + // 3. create grad var base or get grad var base + auto grad_var_base_tmp = dest_var_base->MutableGradVarBase(); + + // 4. 
set grad tensor + auto *dest_grad_tensor = + grad_var_base_tmp->MutableVar()->GetMutable(); + const auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place_); + TensorCopy(src_tensor, place_, *dev_ctx, dest_grad_tensor); + dest_grad_tensor->Resize(dest_dims); + } + } +} + +bool Reducer::HasGrad(size_t var_index) { + const auto grad_var = vars_[var_index]->GradVarBase(); + if (!grad_var || !grad_var->Var().IsInitialized()) { + return false; + } + + const auto &var = grad_var->Var(); + if (var.IsType()) { + if (var.Get().IsInitialized()) { + return true; + } + } else if (var.IsType()) { + if (var.Get().value().IsInitialized()) { + return true; + } + } else { + PADDLE_THROW(platform::errors::PermissionDenied( + "Only support LoDTensor and SelectedRows for gradient var")); + } + return false; +} + void Reducer::FinalizeBackward() { - all_group_ready_ = false; + groups_need_finalize_ = false; #ifdef PADDLE_WITH_XPU_BKCL { std::unique_lock lock(mutex_); cv_.wait(lock, [&] { return comm_op_count_ == 0; }); } #endif + // Must prevent compute_stream_ starting until all comm streams have finished for (int i = 0; i < nrings_; ++i) { parallel_ctx_->WaitComm(i); @@ -767,7 +961,18 @@ void Reducer::FinalizeBackward() { InitializeGroups(group_indices_); } - VLOG(3) << "In the batch, Reducer is finished..."; + if (find_unused_vars_each_step_) { +// TODO(liuyuhui) support xpu about Tensorcopy/TensorFromVector/TensorToVector +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + ProcessUnusedDenseVars(); +#endif + // Initialize local used vars + local_used_vars_.clear(); + local_used_vars_.resize(vars_.size(), 0); + VLOG(3) << "ProcessUnusedDenseVars is finished."; + } + + VLOG(3) << "In the batch, Reducer is finished."; } // According to the size of each parameter, it is allocated to different groups. diff --git a/paddle/fluid/imperative/reducer.h b/paddle/fluid/imperative/reducer.h index b2680d0dea71aa19399242bffedc3d7914cebbb9..8392ab2c704d503a622cc09cd5a7efb8ebc680b3 100644 --- a/paddle/fluid/imperative/reducer.h +++ b/paddle/fluid/imperative/reducer.h @@ -27,6 +27,7 @@ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/for_range.h" @@ -153,13 +154,23 @@ class Reducer { void MarkGroupReady(size_t group_index); - void FusedAllReduceSchedule(int run_order, Group& group); // NOLINT + void FusedAllReduceSchedule(const int run_order, Group& group, // NOLINT + const int curr_group_index); void FinalizeBackward(); std::vector> RebuildGruops(); - inline bool NeedRebuildGroup() { return !has_rebuilt_group_; } + inline bool NeedRebuildGroup() { + return !has_rebuilt_group_ && !find_unused_vars_each_step_; + } + + void ProcessUnusedDenseVars(); + + bool HasGrad(size_t var_index); + + void TraverseBackwardGraph( + const std::vector>& outputs); private: std::vector> vars_; @@ -187,8 +198,9 @@ class Reducer { std::unordered_map var_index_map_; std::vector unused_vars_; bool has_marked_unused_vars_{false}; - bool find_unused_vars_{false}; - bool all_group_ready_{false}; + bool find_unused_vars_each_step_{false}; + bool find_unused_vars_once_{true}; + bool groups_need_finalize_{false}; #ifdef PADDLE_WITH_XPU_BKCL // comm_pool_ is used for scheduling allreduce in multi Kunlun cards training. 
std::unique_ptr<::ThreadPool> comm_pool_{nullptr}; @@ -196,6 +208,19 @@ class Reducer { std::mutex mutex_; std::condition_variable cv_; #endif + + // it just for checking hook, each parameter can only trigger one hook + std::vector vars_marked_ready_; + + // Following variables are to help control flow. + // local_used_vars_ uses 0/1 to indicate whether the + // var is used in iteration. After the end of the + // iteration, global_used_vars_ is obtained synchronously + // globally. Choose whether to update the local + // gradient according to the global_used_vars_. + std::vector local_used_vars_; + // global_used_vars_ is used in comm stream to avoid wait + framework::Variable global_used_vars_; }; std::vector> AssignGroupBySize( diff --git a/paddle/fluid/imperative/tests/nccl_context_test.cc b/paddle/fluid/imperative/tests/nccl_context_test.cc index 4967df5341d3559aa9a8d6c57e8d12ba808396e0..2d8a08217b0b83cfc22c250551e9aa81e01e86c0 100644 --- a/paddle/fluid/imperative/tests/nccl_context_test.cc +++ b/paddle/fluid/imperative/tests/nccl_context_test.cc @@ -15,6 +15,7 @@ #include // NOLINT #include "paddle/fluid/imperative/nccl_context.h" +#include "paddle/fluid/platform/gen_comm_id_helper.h" #include "gtest/gtest.h" @@ -36,9 +37,13 @@ imperative::ParallelStrategy GetStrategy(int local_rank) { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) void BcastNCCLId(int local_rank, std::vector* nccl_ids) { auto strategy = GetStrategy(local_rank); + int server_fd = platform::CreateListenSocket(strategy.current_endpoint_); + platform::CUDAPlace gpu(local_rank); imperative::NCCLParallelContext ctx(strategy, gpu); - ctx.BcastNCCLId(*nccl_ids, 0); + ctx.BcastNCCLId(*nccl_ids, 0, server_fd); + + platform::CloseSocket(server_fd); } TEST(BcastNCCLId, Run) { diff --git a/paddle/fluid/imperative/tests/test_hooks.cc b/paddle/fluid/imperative/tests/test_hooks.cc index 7bf5f876681bab001665080f506b087455dfde4b..5c4e1538cf053853d2e9d5dab88419d930b06b63 100644 --- a/paddle/fluid/imperative/tests/test_hooks.cc +++ b/paddle/fluid/imperative/tests/test_hooks.cc @@ -37,6 +37,30 @@ namespace imperative { using vb_vector = std::vector>; using var_pair = std::pair; +std::shared_ptr DoubleHook( + const std::shared_ptr& var) { + // 1. create out var + auto out_var = std::make_shared(var->Name()); + out_var->SetType(var->Type()); + out_var->SetDataType(var->DataType()); + out_var->SetForwardDataType(var->ForwardDataType()); + out_var->InnerSetOverridedStopGradient(var->InnerOverridedStopGradient()); + + // 2. get input and output var's tensor + auto* out_tensor = out_var->MutableVar()->GetMutable(); + auto& tensor = var->Var().Get(); + out_tensor->Resize(tensor.dims()); + + // 3. double calc + auto* data = tensor.data(); + auto* out_data = out_tensor->mutable_data(platform::CPUPlace()); + for (int64_t i = 0; i < out_tensor->numel(); ++i) { + out_data[i] = data[i] * 2.0; + } + + return out_var; +} + TEST(TestHooks, TestGradVarLeafBackwardHook) { // 1. 
prepare Tracer tracer; @@ -73,17 +97,14 @@ TEST(TestHooks, TestGradVarLeafBackwardHook) { framework::AttributeMap mul_attr_map; mul_attr_map["use_mkldnn"] = false; - // add GradAccumulatorPostHook - auto x_var_wrapper = x->SharedVar(); - x_var_wrapper->AddGradVarLeafBackwardHook( - std::unique_ptr( - new LambdaGradAccumulatorPostHook([=](VariableWrapper* grad) { - auto* grad_tensor = - grad->MutableVar()->GetMutable(); - for (int i = 0; i < grad_tensor->numel(); ++i) { - grad_tensor->mutable_data(place)[i] *= 2.0; - } - }))); + // add VariableWrapper hook + x->GradVarBase()->AddVariableWrapperHook( + std::make_shared(DoubleHook)); + + // add Void hook + int64_t hook_value = 0; + x->GradVarBase()->AddVoidHook( + std::make_shared>([&]() { hook_value = 10; })); // 2. forward tracer.TraceOp("mul", ins, outs, mul_attr_map, place, true); @@ -93,16 +114,21 @@ TEST(TestHooks, TestGradVarLeafBackwardHook) { ASSERT_EQ(out->GradVarBase()->GradOpNum(), 1UL); // 3. backward + std::vector> tensors{out}; + std::vector> grad_tensors{nullptr}; BasicEngine engine; - engine.Init(out.get()); + engine.Init(tensors, grad_tensors); engine.Execute(); + // verify VariableWrapper hook result framework::LoDTensor x_grad; framework::TensorCopySync(x->GradVar().Get(), place, &x_grad); for (int i = 0; i < x_grad.numel(); ++i) { ASSERT_EQ(x_grad.data()[i], 8.0); } + // verify Void hook result + ASSERT_EQ(hook_value, 10); framework::LoDTensor y_grad; framework::TensorCopySync(y->GradVar().Get(), place, @@ -151,17 +177,14 @@ void GradVarLeafBackwardHookWithGradAccmulatedTest() { memory::Copy(place, mutable_z, place, src_data.data(), sizeof(float) * src_data.size()); - // add GradAccumulatorPostHook - auto x_var_wrapper = x->SharedVar(); - x_var_wrapper->AddGradVarLeafBackwardHook( - std::unique_ptr( - new LambdaGradAccumulatorPostHook([=](VariableWrapper* grad) { - auto* grad_tensor = - grad->MutableVar()->GetMutable(); - for (int i = 0; i < grad_tensor->numel(); ++i) { - grad_tensor->mutable_data(place)[i] *= 2.0; - } - }))); + // add VariableWrapper hook + x->GradVarBase()->AddVariableWrapperHook( + std::make_shared(DoubleHook)); + + // add Void hook + int64_t hook_value = 0; + x->GradVarBase()->AddVoidHook( + std::make_shared>([&]() { hook_value = 100; })); // 2. forward var_pair x_pair = var_pair("X", vb_vector(1, x)); @@ -193,16 +216,21 @@ void GradVarLeafBackwardHookWithGradAccmulatedTest() { ASSERT_EQ(out->GradVarBase()->GradOpNum(), 1UL); // 3. 
backward + std::vector> tensors{out}; + std::vector> grad_tensors{nullptr}; BasicEngine engine; - engine.Init(out.get()); + engine.Init(tensors, grad_tensors); engine.Execute(); + // verify VariableWrapper hook result framework::LoDTensor x_grad; framework::TensorCopySync(x->GradVar().Get(), place, &x_grad); for (int i = 0; i < x_grad.numel(); ++i) { ASSERT_EQ(x_grad.data()[i], 16.0); } + // verify Void hook result + ASSERT_EQ(hook_value, 100); framework::LoDTensor y_grad; framework::TensorCopySync(y->GradVar().Get(), place, diff --git a/paddle/fluid/imperative/tests/test_tracer.cc b/paddle/fluid/imperative/tests/test_tracer.cc index 9e3b0ea5df6838477cac579df164bfddf31a176c..76de413b3e6033bf1e6027bbd3bbc210d8a405df 100644 --- a/paddle/fluid/imperative/tests/test_tracer.cc +++ b/paddle/fluid/imperative/tests/test_tracer.cc @@ -250,7 +250,10 @@ TEST(test_tracer, test_trace_op_with_multi_device_inputs) { tracer.TraceOp("reduce_sum", reduce_in, reduce_out, reduce_attr_map, gpu_place, true); imperative::BasicEngine engine; - engine.Init(reduce_sum_out.get()); + + std::vector> tensors{reduce_sum_out}; + std::vector> grad_tensors{nullptr}; + engine.Init(tensors, grad_tensors); engine.Execute(); framework::LoDTensor rlt; @@ -376,8 +379,10 @@ TEST(test_tracer, test_var_without_grad_var) { ASSERT_EQ(y_in->GradVarBase()->GradOpNum(), 0UL); ASSERT_EQ(vout->GradVarBase()->GradOpNum(), 1UL); + std::vector> tensors{vout}; + std::vector> grad_tensors{nullptr}; imperative::BasicEngine engine; - engine.Init(vout.get()); + engine.Init(tensors, grad_tensors); engine.Execute(); // check the grad diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index c9567a98a73bf213c70bbfbd2c9a39b5efb38d2e..0c2fd2da7e0ba6ed13ad993f30ed8650e6fc15a2 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -19,6 +19,7 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/imperative/amp_auto_cast.h" #include "paddle/fluid/imperative/op_base.h" +#include "paddle/fluid/platform/denormal.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/string/string_helper.h" @@ -38,7 +39,7 @@ void SetCurrentTracer(const std::shared_ptr& tracer) { VLOG(6) << "Set current tracer: " << g_current_tracer; } -static void PassStopGradient(const NameVarBaseMap& outs, bool generate_grad) { +void PassStopGradient(const NameVarBaseMap& outs, bool generate_grad) { for (const auto& pair : outs) { for (const auto& var : pair.second) { // NOTE(zhiqiu): this happends when None output are passed from python @@ -83,7 +84,7 @@ paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists( if (gcs_.count(place) == 0) { std::unique_ptr gc; if (platform::is_gpu_place(place)) { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) gc.reset(new framework::DefaultStreamGarbageCollector( BOOST_GET_CONST(platform::CUDAPlace, place), 0)); @@ -94,7 +95,7 @@ paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists( "Please recompile or reinstall Paddle with GPU support.")); #endif } else if (platform::is_cuda_pinned_place(place)) { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) gc.reset(new framework::CUDAPinnedGarbageCollector( BOOST_GET_CONST(platform::CUDAPinnedPlace, place), 0)); @@ -134,6 +135,7 @@ void Tracer::TraceOp(const std::string& type, const NameVarBaseMap& ins, const platform::Place& place, bool trace_backward, const std::map& inplace_map) { 
platform::RecordEvent op_type_record_event(type); + platform::ScopedFlushDenormal flush; VLOG(1) << "Trace Op: " << type; if (FLAGS_use_mkldnn) { // if both lists are empty all ops are enabled (default for diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index b10d1b2d0b49dac42f80e9462fb35ee58312b8b0..8f50550878262f1d37c34923e4c8bc55460b08d6 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -130,5 +130,7 @@ void IncreaseVarbaseReferenceCountUntilCopyComplete( const std::shared_ptr& var, const platform::Place& place); +void PassStopGradient(const NameVarBaseMap& outs, bool generate_grad); + } // namespace imperative } // namespace paddle diff --git a/paddle/fluid/imperative/variable_wrapper.h b/paddle/fluid/imperative/variable_wrapper.h index b42f25dcc880015b9e191970f84e970f329bb0ee..5fa8b89a396d9bd509de375471b8d383b9b91874 100644 --- a/paddle/fluid/imperative/variable_wrapper.h +++ b/paddle/fluid/imperative/variable_wrapper.h @@ -27,8 +27,8 @@ namespace paddle { namespace imperative { -class InteriorVarHookPipeline; -class LeafVarHookPipeline; +class VariableWrapperHook; +class InplaceVariableWrapperHook; class VarBase; class GradOpNode; @@ -38,6 +38,9 @@ class VariableWrapper { explicit VariableWrapper(const std::string& name) : name_(name) {} + VariableWrapper(const std::string& name, const framework::Variable& variable) + : var_(variable), name_(name) {} + ~VariableWrapper() { VLOG(10) << "Destruct VariableWrapper: " << Name(); } const framework::Variable& Var() const { return var_; } @@ -193,42 +196,6 @@ class VariableWrapper { } } - /* Hook related method: only can be call by GradVarBase */ - - bool HasInteriorHooks() const { return interior_hooks_ != nullptr; } - - bool HasLeafHooks() const { return leaf_hooks_ != nullptr; } - - void AddGradVarInteriorHook(std::unique_ptr&& hook) { - auto interior_hooks = GetGradVarInteriorHooksSafely(); - interior_hooks->add_hook(std::move(hook)); - } - - void AddGradVarLeafHook(std::unique_ptr&& hook) { - auto leaf_hooks = GetGradVarLeafHooksSafely(); - leaf_hooks->add_hook(std::move(hook)); - } - - void AddGradVarLeafBackwardHook( - std::unique_ptr&& hook) { - auto leaf_hooks = GetGradVarLeafHooksSafely(); - leaf_hooks->add_backward_hook(std::move(hook)); - } - - const std::shared_ptr& GetInteriorHooks() const { - return interior_hooks_; - } - - std::shared_ptr& GetInteriorHooks() { - return interior_hooks_; - } - - const std::shared_ptr& GetLeafHooks() const { - return leaf_hooks_; - } - - std::shared_ptr& GetLeafHooks() { return leaf_hooks_; } - uint32_t InplaceVersionSnapshot() const { return inplace_version_snapshot_; } void ResetInplaceVersion() { @@ -255,6 +222,38 @@ class VariableWrapper { return; } + /* Hook related methods */ + bool HasVariableWrapperHook() const { return !var_hooks_.empty(); } + + int64_t AddVariableWrapperHook(std::shared_ptr&& hook) { + var_hooks_.emplace(next_hook_id_, std::move(hook)); + return next_hook_id_++; + } + + bool RemoveVariableWrapperHook(const int64_t& hook_id) { + auto remove_cnt = var_hooks_.erase(hook_id); + if (remove_cnt == 0) { + return false; + } + return true; + } + + const std::map>& + GetVariableWrapperHooks() const { + return var_hooks_; + } + + bool HasVoidHook() const { return !void_hooks_.empty(); } + + void AddVoidHook(std::shared_ptr>&& hook) { + void_hooks_.emplace_back(std::move(hook)); + } + + const std::vector>>& GetVoidHooks() + const { + return void_hooks_; + } + private: void SetGradVar(const std::shared_ptr& var) 
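// A usage sketch for the hook API above (a VariableWrapper `wrapper` and a
// hook type MyHook derived from VariableWrapperHook are assumed here for
// illustration only):
//   int64_t id = wrapper.AddVariableWrapperHook(std::make_shared<MyHook>());
//   wrapper.RemoveVariableWrapperHook(id);  // returns false if id is unknown
//   wrapper.AddVoidHook(
//       std::make_shared<std::function<void()>>([]() { /* after backward */ }));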
{ auto shared_var = grad_var_.lock(); @@ -289,41 +288,6 @@ class VariableWrapper { } } - /* Hook related private methods */ - std::shared_ptr GetGradVarSafely() const { - auto shared_grad_var = grad_var_.lock(); - PADDLE_ENFORCE_NOT_NULL( - shared_grad_var, - platform::errors::PermissionDenied( - "Cannot add gradient hook on Tensor without gradient.")); - return shared_grad_var; - } - - std::shared_ptr& GetGradVarInteriorHooksSafely() { - auto shared_grad_var = GetGradVarSafely(); - PADDLE_ENFORCE_EQ(HasGradNode(), true, - platform::errors::PermissionDenied( - "Only interior Tensor in backward can register " - "interior gradient hook.")); - if (shared_grad_var->interior_hooks_ == nullptr) { - shared_grad_var->interior_hooks_ = - std::make_shared(); - } - return shared_grad_var->interior_hooks_; - } - - std::shared_ptr& GetGradVarLeafHooksSafely() { - auto shared_grad_var = GetGradVarSafely(); - PADDLE_ENFORCE_EQ( - HasGradNode(), false, - platform::errors::PermissionDenied( - "Only leaf Tensor in backward can register leaf gradient hook.")); - if (shared_grad_var->leaf_hooks_ == nullptr) { - shared_grad_var->leaf_hooks_ = std::make_shared(); - } - return shared_grad_var->leaf_hooks_; - } - private: framework::Variable var_; std::string name_; @@ -358,11 +322,19 @@ class VariableWrapper { // isn't need bool is_empty_{false}; - // NOTE: only grad var can hold hooks now - // only interior var can hold interior hooks - std::shared_ptr interior_hooks_; - // only leaf var can hold leaf hooks - std::shared_ptr leaf_hooks_; + // NOTE(chenweihang): only grad var will hold hooks now + int64_t next_hook_id_{0}; + // [ Hooks with VariableWrapper as input and output ] + // NOTE: Now registered for grad var, support adding and removing, + // key is the accumulated int64_t value + // NOTE: Var hook need to support removing, so need hook id + std::map> var_hooks_; + // [ Hooks without input and output ] + // NOTE: Now registered after the execution of the entire backward + // process is over, currently only used for reducing in distributed + // training + // NOTE: Now no need to support remove void hook + std::vector>> void_hooks_; }; } // namespace imperative diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 7a8bfc1a8c7008f5b16a2fca6692600b39690e59..c002c7a10cb7b3e953a4e2551e54b20998dea400 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -33,9 +33,13 @@ if (WITH_LITE) add_subdirectory(lite) endif() -# fluid_modules exclude API-interface of inference/api and inference/capi +# fluid_modules exclude API-interface of inference/api and inference/capi_exp get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES) +# Adapt to custom op mechanism: Include the header files related to the data type +# to avoid exposing the path of the underlying file +include_directories(${PADDLE_SOURCE_DIR}/paddle/fluid/platform) + add_subdirectory(api) # Create static inference library if needed @@ -57,7 +61,7 @@ if(NOT APPLE) endif() # C inference API -add_subdirectory(capi) +add_subdirectory(capi_exp) if(WITH_TESTING AND WITH_INFERENCE_API_TEST) add_subdirectory(tests/api) diff --git a/paddle/fluid/inference/analysis/analysis_pass.h b/paddle/fluid/inference/analysis/analysis_pass.h index d5a972fab3beae4d4e2e512d1ccda3f0b8356682..14a1c3eea3417432c76ce03d41b558577d2aa037 100644 --- a/paddle/fluid/inference/analysis/analysis_pass.h +++ b/paddle/fluid/inference/analysis/analysis_pass.h @@ -18,7 +18,6 @@ limitations under the License. 
*/ #include #include -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/inference/analysis/argument.h" #include "paddle/fluid/inference/analysis/helper.h" diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index bd27b1f5f34475db793d643f2d12508e0aea631e..255c6ca75dfd74b3cf5984661ea931d36295f72a 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -213,6 +213,11 @@ struct Argument { DECL_ARGUMENT_FIELD(tensorrt_use_calib_mode, TensorRtUseCalibMode, bool); DECL_ARGUMENT_FIELD(tensorrt_use_oss, TensorRtUseOSS, bool); + DECL_ARGUMENT_FIELD(use_dlnne, UseDlnne, bool); + DECL_ARGUMENT_FIELD(dlnne_min_subgraph_size, DlnneMinSubgraphSize, int); + DECL_ARGUMENT_FIELD(dlnne_max_batch_size, DlnneMaxBatchSize, int); + DECL_ARGUMENT_FIELD(dlnne_workspace_size, DlnneWorkspaceSize, int); + DECL_ARGUMENT_FIELD(lite_passes_filter, LitePassesFilter, std::vector); DECL_ARGUMENT_FIELD(lite_ops_filter, LiteOpsFilter, std::vector); @@ -222,6 +227,11 @@ struct Argument { DECL_ARGUMENT_FIELD(use_xpu, UseXpu, bool); DECL_ARGUMENT_FIELD(xpu_l3_workspace_size, XpuL3WorkspaceSize, int); + DECL_ARGUMENT_FIELD(xpu_locked, XpuLocked, bool); + DECL_ARGUMENT_FIELD(xpu_autotune, XpuAutotune, bool); + DECL_ARGUMENT_FIELD(xpu_autotune_file, XpuAutotuneFile, std::string); + DECL_ARGUMENT_FIELD(xpu_precision, XpuPrecision, std::string); + DECL_ARGUMENT_FIELD(xpu_adaptive_seqlen, XpuAdaptiveSeqlen, bool); // Memory optimized related. DECL_ARGUMENT_FIELD(enable_memory_optim, EnableMemoryOptim, bool); diff --git a/paddle/fluid/inference/analysis/helper.cc b/paddle/fluid/inference/analysis/helper.cc index 368ef2e5583fe2f6fcb24c98ded02f4e5325f7a4..ede0402f816765f7079eab91170e2d9d5905e915 100644 --- a/paddle/fluid/inference/analysis/helper.cc +++ b/paddle/fluid/inference/analysis/helper.cc @@ -13,7 +13,6 @@ // limitations under the License. #include "paddle/fluid/inference/analysis/helper.h" -#include "paddle/fluid/framework/framework.pb.h" namespace paddle { namespace inference { diff --git a/paddle/fluid/inference/analysis/helper.h b/paddle/fluid/inference/analysis/helper.h index ab4949935140c682c7dee355a0e4ed9c2b3a1f5a..cace420d87c9df54387c27cecc58705c19ce5336 100644 --- a/paddle/fluid/inference/analysis/helper.h +++ b/paddle/fluid/inference/analysis/helper.h @@ -25,7 +25,6 @@ limitations under the License. 
*/ #include #include -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index a4e263e2f464c4021b049093c49ddaecb056284f..4bb08dc96b1cf529c1b433092f3b9e51d03aa7e9 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -106,8 +106,8 @@ void IRPassManager::CreatePasses(Argument *argument, bool use_static_engine = argument->tensorrt_use_static_engine(); bool model_from_memory = argument->model_from_memory(); std::string optim_cache_dir = argument->optim_cache_dir(); - bool int8_valid = - !(model_from_memory && optim_cache_dir.empty() && enable_int8); + bool int8_valid = !(model_from_memory && optim_cache_dir.empty() && + enable_int8 && use_calib_mode); PADDLE_ENFORCE_EQ( int8_valid, true, platform::errors::PreconditionNotMet( @@ -166,6 +166,11 @@ void IRPassManager::CreatePasses(Argument *argument, // run fp16. pass->Set("disable_trt_plugin_fp16", new bool(argument->disable_trt_plugin_fp16())); + } else if (pass_name == "dlnne_subgraph_pass") { + pass->Set("min_subgraph_size", + new int(argument->dlnne_min_subgraph_size())); + pass->Set("program", + new framework::ProgramDesc *(&argument->main_program())); } if (pass_name == "lite_subgraph_pass") { bool enable_int8 = @@ -183,6 +188,12 @@ void IRPassManager::CreatePasses(Argument *argument, new int(argument->xpu_l3_workspace_size())); pass->Set("cpu_math_library_num_threads", new int(argument->cpu_math_library_num_threads())); + pass->Set("locked", new bool(argument->xpu_locked())); + pass->Set("autotune", new bool(argument->xpu_autotune())); + pass->Set("autotune_file", + new std::string(argument->xpu_autotune_file())); + pass->Set("precision", new std::string(argument->xpu_precision())); + pass->Set("adaptive_seqlen", new bool(argument->xpu_adaptive_seqlen())); } disable_logs_ = argument->disable_logs(); if (pass_name == "fc_fuse_pass") { diff --git a/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt index e35178428cc7bae7f5795e2a4652b808956f6776..330f7a99847344f7359a29e26efac71e969bf06d 100644 --- a/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt @@ -20,3 +20,15 @@ if (WITH_LITE) set(INFER_IR_PASSES ${INFER_IR_PASSES} lite_subgraph_pass CACHE INTERNAL "") cc_test(lite_subgraph_pass_tester SRCS lite_subgraph_pass_tester.cc DEPS lite_subgraph_pass gtest glog) endif() + +MESSAGE("WITH_DLNNE:${WITH_DLNNE}") +if(WITH_DLNNE) + cc_library(dlnne_subgraph_pass SRCS dlnne_subgraph_pass.cc DEPS ${analysis_deps} subgraph_util) + set(analysis_deps ${analysis_deps} + subgraph_util dlnne_subgraph_pass + CACHE INTERNAL "") + + set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h.tmp) + file(APPEND ${pass_file} "USE_PASS(dlnne_subgraph_pass);\n") + set(INFER_IR_PASSES ${INFER_IR_PASSES} dlnne_subgraph_pass CACHE INTERNAL "") +endif() diff --git a/paddle/fluid/operators/distributed/large_scale_kv.cc b/paddle/fluid/inference/analysis/ir_passes/dlnne_reg_py.h similarity index 64% rename from paddle/fluid/operators/distributed/large_scale_kv.cc rename to paddle/fluid/inference/analysis/ir_passes/dlnne_reg_py.h index 
d2673ed6ffb3667eed2a4599ae462587c18431b0..ae977c1403a8793b0611496702515f1df952d5a1 100644 --- a/paddle/fluid/operators/distributed/large_scale_kv.cc +++ b/paddle/fluid/inference/analysis/ir_passes/dlnne_reg_py.h @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -11,16 +11,11 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. - -#include "paddle/fluid/operators/distributed/large_scale_kv.h" +#pragma once namespace paddle { -namespace operators { -namespace distributed { - -std::once_flag LargeScaleKV::init_flag_; -std::shared_ptr LargeScaleKV::scale_kv_(nullptr); +namespace inference { -} // namespace distributed -} // namespace operators +int RegisterPyFunc(const std::string& name, void* pfn); +} // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/analysis/ir_passes/dlnne_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/dlnne_subgraph_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..8f789139af9bfc35841f284d043a2c86f5803e93 --- /dev/null +++ b/paddle/fluid/inference/analysis/ir_passes/dlnne_subgraph_pass.cc @@ -0,0 +1,351 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
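// A usage sketch for the registration hook declared in dlnne_reg_py.h above
// (MyConvertGraph is a hypothetical callback, shown for illustration only):
//   int MyConvertGraph(const char *graph_name) { /* build DLNNE engine */ return 0; }
//   paddle::inference::RegisterPyFunc("convert_graph",
//                                     reinterpret_cast<void *>(&MyConvertGraph));
// ConvertGraph(engine_key) in the pass below then forwards to this callback.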
+#include +#include +#include + +#include +#include + +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/ir/subgraph_detector.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/inference/analysis/helper.h" +#include "paddle/fluid/inference/analysis/ir_passes/dlnne_reg_py.h" +#include "paddle/fluid/inference/analysis/ir_passes/dlnne_subgraph_pass.h" +#include "paddle/fluid/string/pretty_log.h" + +namespace paddle { +namespace inference { + +int (*PyConvertGraph)(const char *graph_name); + +int RegisterPyFunc(const std::string &name, void *pfn) { + if (name.compare("convert_graph") == 0) { + PyConvertGraph = reinterpret_cast(pfn); + } + + return 0; +} +int ConvertGraph(std::string graph_name) { + LOG(INFO) << "starting doing convert_graph"; + + PyConvertGraph(graph_name.c_str()); + + return 0; +} + +namespace analysis { + +using framework::ir::Node; + +void analysis::DlnneSubgraphPass::ApplyImpl(framework::ir::Graph *graph) const { + static std::unordered_set teller_set{ + "mul", "matmul", "conv2d", "pool2d", "relu", "softmax", "sigmoid", + "hard_swish", "depthwise_conv2d", "batch_norm", "concat", "tanh", "pad", + "elementwise_add", "elementwise_mul", "dropout", "prelu", + "conv2d_transpose", "leaky_relu", + // "fc", + "shuffle_channel", "swish", "split", + // "instance_norm", + "gelu", + // "layer_norm", + // "scale", + // "stack", + "relu6", "reshape2", "transpose2", "concat", "slice", + }; + + framework::ir::FusePassBase::Init("dlnne_subgraph_pass", graph); + + auto teller = [&](const framework::ir::Node *node) { + if (!node->IsOp() || !node->Op()) return false; + return teller_set.find(node->Op()->Type()) != teller_set.end(); + }; + + framework::ir::SubGraphFuser fuser( + graph, teller, Get("min_subgraph_size") /*min subgraph size*/, + "dlnne_engine"); + fuser(); + + std::vector graph_param_names = + ExtractParameters(graph->Nodes()); + // those parameter already exist in dlnne, and should not have another copy in + // fluid. 
+ std::vector repetitive_params; + + for (auto *node : graph->Nodes()) { + if (node->IsOp() && !framework::ir::Agent(node).subgraph()->empty()) { + CreateDlnneOp(node, graph, graph_param_names, &repetitive_params); + + std::unordered_set nodes2remove( + framework::ir::Agent(node).subgraph()->begin(), + framework::ir::Agent(node).subgraph()->end()); + framework::ir::GraphSafeRemoveNodes(graph, nodes2remove); + } + } + + std::unordered_set nodes2remove; + for (auto *node : graph->Nodes()) { + if (node->IsOp() && framework::ir::Agent(node).deleted()) { + nodes2remove.insert(node); + } + } + framework::ir::GraphSafeRemoveNodes(graph, nodes2remove); +} + +std::string GenerateEngineKey(const std::set &engine_inputs, + const std::set &engine_outputs, + const std::string &predictor_id) { + std::string engine_hash_key = ""; + for (auto name : engine_inputs) { + engine_hash_key += name; + } + for (auto name : engine_outputs) { + engine_hash_key += name; + } + engine_hash_key += predictor_id; + auto engine_key = std::to_string(std::hash()(engine_hash_key)); + return engine_key; +} +std::string replace_name(std::string name, const char *raw, + const char *new_char) { + std::string r_name = name; + int pos = r_name.find(raw); + while (pos >= 0) { + r_name = r_name.replace(pos, 1, new_char); + pos = r_name.find(raw); + } + return r_name; +} + +void DlnneSubgraphPass::CreateDlnneOp( + framework::ir::Node *node, framework::ir::Graph *graph, + const std::vector &graph_params, + std::vector *repetitive_params) const { + auto *op_desc = node->Op(); + auto &subgraph = *framework::ir::Agent(node).subgraph(); + PADDLE_ENFORCE_EQ(subgraph.empty(), false, + platform::errors::PreconditionNotMet( + "The subgraph should not be empty.")); + + // A fake block desc. + framework::proto::BlockDesc block_proto; + framework::BlockDesc block_desc(nullptr, &block_proto); + block_desc.Proto()->set_parent_idx(-1); + block_desc.Proto()->set_idx(0); + LOG(INFO) << "--- detect a sub-graph with " << subgraph.size() << " nodes"; + // for debug + framework::ProgramDesc tmp_dump_program_desc; + auto *tmp_dump_main_block = tmp_dump_program_desc.MutableBlock(0); + + std::unordered_map name_var_desc; + std::set name_var_input_nodes; + std::set name_var_output_nodes; + std::set name_ops; + + for (auto *node : subgraph) { + auto *op = block_desc.AppendOp(); + *op->Proto() = *node->Op()->Proto(); + + // debug + { + name_ops.insert(node->Name()); + auto *tmp_dump_new_block_op = tmp_dump_main_block->AppendOp(); + + framework::OpDesc op_desc; + op_desc.CopyFrom(*node->Op()); + + for (auto argument_name : op_desc.InputArgumentNames()) { + if (std::count(graph_params.begin(), graph_params.end(), + argument_name) > 0) { + op_desc.Rename(argument_name, replace_name(argument_name, "/", ".")); + } + } + for (auto argument_name : op_desc.OutputArgumentNames()) { + if (std::count(graph_params.begin(), graph_params.end(), + argument_name) > 0) { + op_desc.Rename(argument_name, replace_name(argument_name, "/", ".")); + } + } + *tmp_dump_new_block_op->Proto() = *op_desc.Proto(); + + for (auto *x : node->inputs) { + if (x->IsVar()) { + name_var_desc[x->Name()] = x->Var(); + } + if (std::count(graph_params.begin(), graph_params.end(), x->Name()) == + 0) + name_var_input_nodes.insert(x->Name()); + } + + for (auto *x : node->outputs) { + if (x->IsVar()) { + name_var_desc[x->Name()] = x->Var(); + } + if (std::count(graph_params.begin(), graph_params.end(), x->Name()) == + 0) + name_var_output_nodes.insert(x->Name()); + } + } + } + std::set valid_input_names; 
+ std::set valid_output_names; + for (auto name : name_var_output_nodes) { + if (name_var_input_nodes.find(name) == name_var_input_nodes.end()) { + valid_output_names.insert(name); + } + } + + for (auto name : name_var_input_nodes) { + if (name_var_output_nodes.find(name) == name_var_output_nodes.end()) { + valid_input_names.insert(name); + } + } + + // Then, we will use the input_names_with_id and output_names_with_id to + // generate the engine key. + // So, We use set instead of unordered_set here to ensure that the engine key + // is unique. + std::set input_names; + std::set input_names_with_id; + std::vector params; + // if we delete fluid copy of params shared by more than 1 ops, there will be + // problem, so we filter them out. + + // The node->inputs contains input tensors and parameters. + for (auto *x : node->inputs) { + input_names.insert(x->Name()); + input_names_with_id.insert(x->Name() + std::to_string(x->id())); + if (std::count(graph_params.begin(), graph_params.end(), x->Name()) > 0) { + params.push_back(x->Name()); + } + } + + std::set output_names; + std::set output_names_with_id; + std::vector origin_output_dims; + for (auto *x : node->outputs) { + origin_output_dims.push_back(x->Var()->GetShape().size()); + output_names.insert(x->Name()); + output_names_with_id.insert(x->Name() + std::to_string(x->id())); + } + + std::unordered_map output_name_map; + std::unordered_map graph_var_map; + + for (framework::ir::Node *node : graph->Nodes()) { + if (node->IsVar() && node->Var()) { + graph_var_map[node->Name()] = node; + } + } + + // Set attrs + op_desc->SetType("dlnne_engine"); + op_desc->SetInput("Xs", std::vector(valid_input_names.begin(), + valid_input_names.end())); + + op_desc->SetOutput("Ys", std::vector(valid_output_names.begin(), + valid_output_names.end())); + + op_desc->SetAttr("parameters", params); + auto engine_key = GenerateEngineKey(input_names_with_id, output_names_with_id, + std::to_string(0)); + op_desc->SetAttr("engine_key", engine_key); + auto *scope = param_scope(); + + { + std::set input_names; + + for (auto name : name_var_input_nodes) { + if (name_var_output_nodes.find(name) == name_var_output_nodes.end()) { + input_names.insert(name); + } + } + + // add feed to subgraph: + int input_idx = 0; + for (auto input_name : input_names) { + auto *feed0 = tmp_dump_main_block->AppendOp(); + feed0->SetType("feed"); + feed0->SetInput("X", {"feed"}); + feed0->SetOutput("Out", {input_name}); + feed0->SetAttr("col", input_idx); + input_idx++; + } + // add fetch to subgraph: + int output_idx = 0; + for (auto output_name : valid_output_names) { + auto *fetch0 = tmp_dump_main_block->AppendOp(); + fetch0->SetType("fetch"); + fetch0->SetInput("X", {output_name}); + fetch0->SetOutput("Out", {"out"}); + fetch0->SetAttr("col", output_idx); + output_idx++; + } + + mkdir("./dump", 0777); + std::string dir_name = "./dump/" + engine_key; + mkdir(dir_name.c_str(), 0777); + ofstream m_stream; + m_stream.open(dir_name + "/__model__", ios::out); + + VLOG(4) << "name_var_desc size:" << name_var_desc.size(); + + for (auto &kv : name_var_desc) { + auto *new_add_var = tmp_dump_main_block->Proto()->add_vars(); + *new_add_var = *kv.second->Proto(); + auto *variable_tmp = scope->FindVar(kv.first); + if (variable_tmp != nullptr) { + *new_add_var->mutable_name() = replace_name(kv.first, "/", "."); + new_add_var->set_persistable(true); + } else { + new_add_var->set_persistable(false); + } + } + + for (auto param_name : params) { + auto *var = scope->FindVar(param_name); + if (var != 
nullptr) { + auto *var_t = var->GetMutable(); + ofstream p_stream; + p_stream.open(dir_name + "/" + replace_name(param_name, "/", "."), + ios::out); + platform::DeviceContextPool &pool = + platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(var_t->place()); + framework::SerializeToStream(p_stream, *var_t, dev_ctx); + p_stream.close(); + } + } + + std::string model; + + tmp_dump_program_desc.Proto()->SerializeToString(&model); + m_stream << model; + m_stream.close(); + + op_desc->SetBlockAttr("sub_block", tmp_dump_main_block); + op_desc->SetAttr("subgraph", model); + op_desc->Flush(); + + ConvertGraph(engine_key); + } +} + +} // namespace analysis +} // namespace inference +} // namespace paddle + +REGISTER_PASS(dlnne_subgraph_pass, + paddle::inference::analysis::DlnneSubgraphPass); diff --git a/paddle/fluid/inference/analysis/ir_passes/dlnne_subgraph_pass.h b/paddle/fluid/inference/analysis/ir_passes/dlnne_subgraph_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..5a1d2506fdb09b5ecb63f8f922490eb4c8c01e2d --- /dev/null +++ b/paddle/fluid/inference/analysis/ir_passes/dlnne_subgraph_pass.h @@ -0,0 +1,55 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#pragma once +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/inference/analysis/ir_passes/subgraph_util.h" +#include "paddle/fluid/inference/api/paddle_analysis_config.h" + +namespace paddle { +namespace framework { +namespace ir { +class Graph; +class Node; +} // namespace ir +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace inference { + +int ConvertGraph(std::string graph_name); + +namespace analysis { + +class DlnneSubgraphPass : public framework::ir::FusePassBase { + public: + void ApplyImpl(framework::ir::Graph *graph) const override; + + private: + void CleanIntermediateOutputs(framework::ir::Node *node); + void CreateDlnneOp(framework::ir::Node *x, framework::ir::Graph *graph, + const std::vector &graph_params, + std::vector *repetitive_params) const; +}; + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc index c697914904b3e949050d6e61a7edb521ad6dd0e5..b8cac8992f4eed36b653b08febe48630c3977652 100644 --- a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc @@ -245,6 +245,11 @@ void LiteSubgraphPass::SetUpEngine( bool use_xpu = Get("use_xpu"); int xpu_l3_workspace_size = Get("xpu_l3_workspace_size"); int cpu_math_library_num_threads = Get("cpu_math_library_num_threads"); + bool locked = Get("locked"); + bool autotune = Get("autotune"); + std::string autotune_file = Get("autotune_file"); + std::string precision = Get("precision"); + bool adaptive_seqlen = Get("adaptive_seqlen"); lite_api::TargetType target_type; if (use_gpu) { @@ -282,6 +287,11 @@ void LiteSubgraphPass::SetUpEngine( }; config.cpu_math_library_num_threads = cpu_math_library_num_threads; config.xpu_l3_workspace_size = xpu_l3_workspace_size; + config.locked = locked; + config.autotune = autotune; + config.autotune_file = autotune_file; + config.precision = precision; + config.adaptive_seqlen = adaptive_seqlen; if (dump_model) { lite::StrToBinaryFile("./model.bin", config.model); lite::StrToBinaryFile("./param.bin", config.param); diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index 8a14e168ca4f7ebce3f7fb619655121da38c7581..f57f07883dcd701470457a3e32e14b1ae0493ea3 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -20,6 +20,7 @@ #include "paddle/fluid/inference/analysis/helper.h" #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/engine.h" +#include "paddle/fluid/inference/tensorrt/helper.h" #include "paddle/fluid/inference/tensorrt/op_teller.h" namespace paddle { @@ -86,7 +87,7 @@ std::string GenerateEngineKey(const std::set &engine_inputs, const std::string &predictor_id, const std::string &max_batch_size, const std::string &precision, - const std::string &use_calib_mode) { + const bool for_calibration) { std::string engine_hash_key = ""; for (auto name : engine_inputs) { engine_hash_key += name; @@ -97,12 +98,13 @@ std::string GenerateEngineKey(const std::set &engine_inputs, engine_hash_key += "#"; } engine_hash_key += predictor_id; - engine_hash_key 
+= "#"; - engine_hash_key += max_batch_size; + if (!for_calibration) { + engine_hash_key += "#"; + engine_hash_key += max_batch_size; + } engine_hash_key += "#"; engine_hash_key += precision; - engine_hash_key += "#"; - engine_hash_key += use_calib_mode; + auto engine_key = std::to_string(std::hash()(engine_hash_key)); VLOG(2) << "TRT engine hash key: " << engine_hash_key; VLOG(2) << "TRT engine key: " << engine_key; @@ -167,11 +169,11 @@ void TensorRtSubgraphPass::CreateTensorRTOp( std::set output_names; std::set output_names_with_id; - std::vector origin_output_dims; + std::map origin_name_output_dims; for (auto *x : node->outputs) { output_names.insert(x->Name()); output_names_with_id.insert(x->Name() + std::to_string(x->id())); - origin_output_dims.push_back(x->Var()->GetShape().size()); + origin_name_output_dims[x->Name()] = x->Var()->GetShape().size(); } std::unordered_map output_name_map; @@ -215,11 +217,13 @@ void TensorRtSubgraphPass::CreateTensorRTOp( // output_mapping help us copy the data from the renamed ITensor // to Tensor. std::vector output_mapping; + std::vector renamed_output_dims; for (auto name : output_names) { PADDLE_ENFORCE_NE(output_name_map.count(name), 0, platform::errors::PreconditionNotMet( "The output_name_map should have %s", name)); output_mapping.push_back(output_name_map[name]); + renamed_output_dims.push_back(origin_name_output_dims[name]); } PADDLE_ENFORCE_EQ(output_mapping.empty(), false, platform::errors::PreconditionNotMet( @@ -242,7 +246,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp( op_desc->SetAttr("workspace_size", Get("workspace_size")); op_desc->SetAttr("gpu_id", Get("gpu_device_id")); op_desc->SetAttr("output_name_mapping", output_mapping); - op_desc->SetAttr("origin_output_dims", origin_output_dims); + op_desc->SetAttr("origin_output_dims", renamed_output_dims); op_desc->SetAttr("parameters", params); // we record all inputs' shapes in attr to check if they are consistent @@ -258,24 +262,31 @@ void TensorRtSubgraphPass::CreateTensorRTOp( // TODO(NHZlX) // There are models with the same structure but the different parameters, // when running in the 'use_serialize' mode, there is a bug. + // serialization is affected by max_batch_size, but calibration is not. + // So we use seperate engine keys in serialization and calibration. auto engine_key = GenerateEngineKey( input_names_with_id, output_names_with_id, std::to_string(0), std::to_string(Get("max_batch_size")), - std::to_string(static_cast(precision_mode)), - std::to_string(static_cast(use_calib_mode))); + std::to_string(static_cast(precision_mode)), false); + auto calibration_engine_key = GenerateEngineKey( + input_names_with_id, output_names_with_id, std::to_string(0), + std::to_string(Get("max_batch_size")), + std::to_string(static_cast(precision_mode)), true); auto predictor_id = Get("predictor_id"); // Get "" when there is no cached calibration table data. 
std::string calibration_data = ""; if (enable_int8 && use_calib_mode) { - calibration_data = GetTrtCalibTableData( - Get("model_opt_cache_dir"), engine_key, enable_int8); + calibration_data = + GetTrtCalibTableData(Get("model_opt_cache_dir"), + calibration_engine_key, enable_int8); } op_desc->SetAttr("calibration_data", calibration_data); op_desc->SetAttr("enable_int8", enable_int8); op_desc->SetAttr("enable_fp16", enable_fp16); op_desc->SetAttr("use_calib_mode", use_calib_mode); op_desc->SetAttr("engine_key", engine_key); + op_desc->SetAttr("calibration_engine_key", calibration_engine_key); op_desc->SetAttr("predictor_id", predictor_id); std::string trt_engine_serialized_data = ""; @@ -311,11 +322,20 @@ void TensorRtSubgraphPass::CreateTensorRTOp( opt_input_shape = {}; } - if (min_input_shape.size() > 0 && TRT_VERSION > 6000) { + auto to_major_version = [&](int full_version) -> float { + return (full_version / 100) / 10.0; + }; + const float compile_time_trt_version = to_major_version(TRT_VERSION); + const float run_time_trt_version = + to_major_version(tensorrt::GetInferLibVersion()); + if (compile_time_trt_version != run_time_trt_version) { LOG_FIRST_N(WARNING, 1) - << "The Paddle lib links the " << TRT_VERSION << " version TensorRT, " - << "make sure the runtime TensorRT you are using is no less than this " - "version, otherwise, there might be Segfault!"; + << "The Paddle Inference library is compiled with " + << compile_time_trt_version << " version TensorRT, " + << "but the runtime TensorRT you are using is " << run_time_trt_version + << " version. " + "This might cause serious compatibility issues. We strongly " + "recommend using the same TRT version at runtime."; } // Setting the disable_trt_plugin_fp16 to true means that TRT plugin will not diff --git a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc index 5e6960c4c7e8c052fb9572f0fd6bcba24a7713b4..fdfd2c60af0c16404953e8639385e539dc13c9b3 100644 --- a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc +++ b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc @@ -103,6 +103,7 @@ void MemoryOptimizePass::CollectVarMemorySize( "merge_lod_tensor", "equal", "sequence_pool", + "recurrent", "lod_reset"}; for (auto* tmp : node->inputs) { CHECK(tmp->IsOp()); diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index 9a4637306bb359a71784b9affbc0004d36d95c05..82c95ba2c95712d2ebe3aa80286689028febf3fe 100755 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -32,10 +32,10 @@ cc_library(paddle_pass_builder SRCS paddle_pass_builder.cc) if(WITH_CRYPTO) cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope reset_tensor_array - analysis_config zero_copy_tensor trainer_desc_proto paddle_crypto) + analysis_config zero_copy_tensor trainer_desc_proto paddle_crypto custom_operator) else() cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope reset_tensor_array - analysis_config zero_copy_tensor trainer_desc_proto) + analysis_config zero_copy_tensor trainer_desc_proto custom_operator) endif() if(WIN32) @@ -57,11 +57,9 @@ if(WITH_TESTING) if (NOT APPLE AND NOT WIN32) inference_base_test(test_api_impl SRCS api_impl_tester.cc DEPS paddle_inference_shared ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${IMG_CLS_RESNET_INSTALL_DIR}) - set_tests_properties(test_api_impl PROPERTIES 
DEPENDS test_image_classification) elseif(WIN32) inference_base_test(test_api_impl SRCS api_impl_tester.cc DEPS ${inference_deps} ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${IMG_CLS_RESNET_INSTALL_DIR}) - set_tests_properties(test_api_impl PROPERTIES DEPENDS test_image_classification) endif() endif() diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 0622fb27d9e38c87a98fcb86da64bdb21570e67d..853c1ac1da8742733e609c1dea098a208eadc015 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -26,6 +26,7 @@ namespace paddle { struct MkldnnQuantizerConfig; extern const std::vector kTRTSubgraphPasses; +extern const std::vector kDlnneSubgraphPasses; extern const std::vector kLiteSubgraphPasses; PassStrategy *AnalysisConfig::pass_builder() const { @@ -95,9 +96,17 @@ void AnalysisConfig::DisableFCPadding() { Update(); } -void AnalysisConfig::EnableXpu(int l3_workspace_size) { +void AnalysisConfig::EnableXpu(int l3_workspace_size, bool locked, + bool autotune, const std::string &autotune_file, + const std::string &precision, + bool adaptive_seqlen) { use_xpu_ = true; xpu_l3_workspace_size_ = l3_workspace_size; + xpu_locked_ = locked; + xpu_autotune_ = autotune; + xpu_autotune_file_ = autotune_file; + xpu_precision_ = precision; + xpu_adaptive_seqlen_ = adaptive_seqlen; Update(); } @@ -134,6 +143,9 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { CP_MEMBER(trt_use_static_engine_); CP_MEMBER(trt_use_calib_mode_); CP_MEMBER(trt_use_oss_); + // Dlnne related + CP_MEMBER(use_dlnne_); + CP_MEMBER(dlnne_min_subgraph_size_); // MKLDNN related. CP_MEMBER(use_mkldnn_); CP_MEMBER(mkldnn_enabled_op_types_); @@ -157,6 +169,11 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { CP_MEMBER(use_xpu_); CP_MEMBER(xpu_l3_workspace_size_); + CP_MEMBER(xpu_locked_); + CP_MEMBER(xpu_autotune_); + CP_MEMBER(xpu_autotune_file_); + CP_MEMBER(xpu_precision_); + CP_MEMBER(xpu_adaptive_seqlen_); // profile related. CP_MEMBER(with_profile_); @@ -211,6 +228,21 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { pass_builder_->DeletePass(ps); } } + if (use_dlnne_) { + auto all_passes = kDlnneSubgraphPasses; + auto other_passes = other.pass_builder()->AllPasses(); + // We should sort them, because the user may call the SwitchIrDebug + // interface, which will change the pass. 
+ std::sort(all_passes.begin(), all_passes.end()); + std::sort(other_passes.begin(), other_passes.end()); + std::vector deleted_passes; + std::set_difference(all_passes.begin(), all_passes.end(), + other_passes.begin(), other_passes.end(), + std::inserter(deleted_passes, deleted_passes.begin())); + for (auto ps : deleted_passes) { + pass_builder_->DeletePass(ps); + } + } } void AnalysisConfig::EnableCUDNN() { @@ -309,6 +341,12 @@ void AnalysisConfig::EnableTensorRtEngine( #endif } +void AnalysisConfig::EnableDlnne(int min_subgraph_size) { + use_dlnne_ = true; + dlnne_min_subgraph_size_ = min_subgraph_size; + Update(); +} + void AnalysisConfig::SetTRTDynamicShapeInfo( std::map> min_input_shape, std::map> max_input_shape, @@ -383,6 +421,14 @@ void AnalysisConfig::Update() { pass_builder()->AppendPass(pass); } } + LOG(INFO) << "use_dlnne_:" << use_dlnne_ << std::endl; + if (use_dlnne_) { + pass_builder()->ClearPasses(); + for (const auto &pass : kDlnneSubgraphPasses) { + pass_builder()->AppendPass(pass); + } + } + if (use_gpu() && use_cudnn_) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (!enable_ir_optim_) { @@ -479,6 +525,9 @@ std::string AnalysisConfig::SerializeInfoCache() { ss << tensorrt_max_batchsize_; ss << tensorrt_min_subgraph_size_; + ss << use_dlnne_; + ss << dlnne_min_subgraph_size_; + for (auto &op : trt_disabled_ops_) ss << op.c_str(); ss << ";"; @@ -512,6 +561,11 @@ std::string AnalysisConfig::SerializeInfoCache() { ss << use_lite_; ss << use_xpu_; ss << xpu_l3_workspace_size_; + ss << xpu_locked_; + ss << xpu_autotune_; + ss << xpu_autotune_file_; + ss << xpu_precision_; + ss << xpu_adaptive_seqlen_; ss << thread_local_stream_; diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 2a1dacedca8f1b81d155841561bd5d5a16ca9344..89c8c7902bac9fd2e15a164f7e0dfd21945cf16e 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -191,22 +191,8 @@ bool AnalysisPredictor::PrepareScope( status_is_cloned_ = true; } else { paddle::framework::InitDevices(); - scope_.reset(new paddle::framework::Scope(), [](framework::Scope *scope) { - delete scope; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - for (int dev_id = 0; dev_id < paddle::platform::GetCUDADeviceCount(); - ++dev_id) { - memory::Release(platform::CUDAPlace(dev_id)); - } -#endif -#ifdef PADDLE_WITH_XPU - for (int dev_id = 0; dev_id < paddle::platform::GetXPUDeviceCount(); - ++dev_id) { - memory::Release(platform::XPUPlace(dev_id)); - } -#endif - memory::Release(platform::CPUPlace()); - }); + // TODO(wilber): we need to release memory occupied by weights. 
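// A usage sketch for the AnalysisConfig options extended above (all values
// are illustrative, not defaults taken from this patch):
//   paddle::AnalysisConfig config;
//   config.EnableXpu(/*l3_workspace_size=*/16 << 20, /*locked=*/false,
//                    /*autotune=*/true, /*autotune_file=*/"",
//                    /*precision=*/"int16", /*adaptive_seqlen=*/false);
//   config.EnableDlnne(/*min_subgraph_size=*/3);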
+ scope_.reset(new paddle::framework::Scope()); status_is_cloned_ = false; } sub_scope_ = &scope_->NewScope(); @@ -537,6 +523,12 @@ void AnalysisPredictor::PrepareArgument() { argument_.SetCloseTrtPluginFp16(config_.disable_trt_plugin_fp16_); } + if (config_.dlnne_enabled()) { + LOG(INFO) << "Dlnne subgraph is enabled"; + argument_.SetUseDlnne(true); + argument_.SetDlnneMinSubgraphSize(config_.dlnne_min_subgraph_size_); + } + if (config_.lite_engine_enabled()) { argument_.SetCpuMathLibraryNumThreads( config_.cpu_math_library_num_threads()); @@ -546,6 +538,11 @@ void AnalysisPredictor::PrepareArgument() { argument_.SetLiteZeroCopy(config_.lite_zero_copy_); argument_.SetUseXpu(config_.use_xpu_); argument_.SetXpuL3WorkspaceSize(config_.xpu_l3_workspace_size_); + argument_.SetXpuLocked(config_.xpu_locked_); + argument_.SetXpuAutotune(config_.xpu_autotune_); + argument_.SetXpuAutotuneFile(config_.xpu_autotune_file_); + argument_.SetXpuPrecision(config_.xpu_precision_); + argument_.SetXpuAdaptiveSeqlen(config_.xpu_adaptive_seqlen_); LOG(INFO) << "Lite subgraph engine is enabled"; } @@ -617,7 +614,7 @@ std::unique_ptr CreatePaddlePredictor< // This function can only be executed once per process. static std::once_flag custom_operators_registered; std::call_once(custom_operators_registered, - []() { paddle::RegisterAllCustomOperator(); }); + []() { inference::RegisterAllCustomOperator(); }); if (config.use_gpu()) { static std::once_flag gflags_initialized; @@ -1017,8 +1014,8 @@ bool AnalysisPredictor::SaveTrtCalibToDisk() { auto &block = inference_program_->Block(0); for (auto &op_desc : block.AllOps()) { if (op_desc->Type() == "tensorrt_engine") { - std::string engine_name = - BOOST_GET_CONST(std::string, op_desc->GetAttr("engine_key")); + std::string engine_name = BOOST_GET_CONST( + std::string, op_desc->GetAttr("calibration_engine_key")); if (!Singleton::Global().Has(engine_name)) { LOG(ERROR) << "You should run the predictor(with trt) on the real data " "to generate calibration info"; @@ -1191,6 +1188,13 @@ USE_TRT_CONVERTER(slice); USE_TRT_CONVERTER(scale); USE_TRT_CONVERTER(stack); USE_TRT_CONVERTER(clip); +USE_TRT_CONVERTER(gather); +USE_TRT_CONVERTER(anchor_generator); +USE_TRT_CONVERTER(yolo_box); +USE_TRT_CONVERTER(roi_align); +USE_TRT_CONVERTER(affine_channel); +USE_TRT_CONVERTER(multiclass_nms); +USE_TRT_CONVERTER(nearest_interp); #endif namespace paddle_infer { diff --git a/paddle/fluid/inference/api/demo_ci/clean.sh b/paddle/fluid/inference/api/demo_ci/clean.sh index 0d9f3d2aa237acaf3bd7adb031b1f2a73c555352..c265721db577527170b1c1c1e4ac8df28de1485d 100755 --- a/paddle/fluid/inference/api/demo_ci/clean.sh +++ b/paddle/fluid/inference/api/demo_ci/clean.sh @@ -1,3 +1,17 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ set -x cd `dirname $0` rm -rf build/ data/ diff --git a/paddle/fluid/inference/api/demo_ci/run.sh b/paddle/fluid/inference/api/demo_ci/run.sh index e11a5b9c3372a746a9ea57811fa07ee8b5c96018..53f925966662667571ef39a5d51dc4536479c295 100755 --- a/paddle/fluid/inference/api/demo_ci/run.sh +++ b/paddle/fluid/inference/api/demo_ci/run.sh @@ -88,7 +88,7 @@ for WITH_STATIC_LIB in ON OFF; do return 0 fi # -----simple_on_word2vec on windows----- - cmake .. -G "Visual Studio 14 2015" -A x64 -DPADDLE_LIB=${inference_install_dir} \ + cmake .. -G "Visual Studio 15 2017" -A x64 -T host=x64 -DPADDLE_LIB=${inference_install_dir} \ -DWITH_MKL=$TURN_ON_MKL \ -DDEMO_NAME=simple_on_word2vec \ -DWITH_GPU=$TEST_GPU_CPU \ @@ -107,7 +107,7 @@ for WITH_STATIC_LIB in ON OFF; do # -----vis_demo on windows----- rm -rf * - cmake .. -G "Visual Studio 14 2015" -A x64 -DPADDLE_LIB=${inference_install_dir} \ + cmake .. -G "Visual Studio 15 2017" -A x64 -T host=x64 -DPADDLE_LIB=${inference_install_dir} \ -DWITH_MKL=$TURN_ON_MKL \ -DDEMO_NAME=vis_demo \ -DWITH_GPU=$TEST_GPU_CPU \ diff --git a/paddle/fluid/inference/api/demo_ci/run_windows_demo.bat b/paddle/fluid/inference/api/demo_ci/run_windows_demo.bat index 523dafa6649b9faa019edc1c1926b5fa408e03d5..d17f516fcca5e8fd3e254e31d3f30a09a717cc5b 100644 --- a/paddle/fluid/inference/api/demo_ci/run_windows_demo.bat +++ b/paddle/fluid/inference/api/demo_ci/run_windows_demo.bat @@ -67,7 +67,7 @@ if /i "%use_gpu%"=="Y" ( rem set_path_vs_command_prompt :set_vcvarsall_dir -SET /P vcvarsall_dir="Please input the path of visual studio command Prompt, such as C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat =======>" +SET /P vcvarsall_dir="Please input the path of visual studio command Prompt, such as C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvarsall.bat =======>" set tmp_var=!vcvarsall_dir! call:remove_space set vcvarsall_dir=!tmp_var! @@ -177,16 +177,16 @@ if /i "%use_mkl%"=="N" ( if /i "%gpu_inference%"=="Y" ( if "%demo_name%"=="trt_mobilenet_demo" ( - cmake .. -G "Visual Studio 14 2015 Win64" -T host=x64 -DWITH_GPU=ON ^ + cmake .. -G "Visual Studio 15 2017 Win64" -T host=x64 -DWITH_GPU=ON ^ -DWITH_MKL=%use_mkl% -DWITH_STATIC_LIB=ON -DCMAKE_BUILD_TYPE=Release -DDEMO_NAME=%demo_name% ^ -DPADDLE_LIB="%paddle_infernece_lib%" -DMSVC_STATIC_CRT=ON -DCUDA_LIB="%cuda_lib_dir%" -DUSE_TENSORRT=ON ) else ( - cmake .. -G "Visual Studio 14 2015 Win64" -T host=x64 -DWITH_GPU=ON ^ + cmake .. -G "Visual Studio 15 2017 Win64" -T host=x64 -DWITH_GPU=ON ^ -DWITH_MKL=%use_mkl% -DWITH_STATIC_LIB=ON -DCMAKE_BUILD_TYPE=Release -DDEMO_NAME=%demo_name% ^ -DPADDLE_LIB="%paddle_infernece_lib%" -DMSVC_STATIC_CRT=ON -DCUDA_LIB="%cuda_lib_dir%" ) ) else ( - cmake .. -G "Visual Studio 14 2015 Win64" -T host=x64 -DWITH_GPU=OFF ^ + cmake .. -G "Visual Studio 15 2017 Win64" -T host=x64 -DWITH_GPU=OFF ^ -DWITH_MKL=%use_mkl% -DWITH_STATIC_LIB=ON -DCMAKE_BUILD_TYPE=Release -DDEMO_NAME=%demo_name% ^ -DPADDLE_LIB="%paddle_infernece_lib%" -DMSVC_STATIC_CRT=ON ) diff --git a/paddle/fluid/inference/api/demo_ci/windows_inference.md b/paddle/fluid/inference/api/demo_ci/windows_inference.md index 73938cb995f17a0ccd8df7effce18c4f81c03916..c646c351462d460d2a3f8c236cf4f870f1d4ad30 100644 --- a/paddle/fluid/inference/api/demo_ci/windows_inference.md +++ b/paddle/fluid/inference/api/demo_ci/windows_inference.md @@ -8,7 +8,7 @@ 3. 
Go to the Paddle/paddle/fluid/inference/api/demo_ci directory, create a build directory, and then use cmake to generate the Visual Studio 2017 solution file.
 Here PADDLE_LIB is the folder that contains the paddle_inference.lib mentioned above, and CUDA_LIB is the x64 CUDA system library directory.
 ```shell
-  cmake .. -G "Visual Studio 14 2015 Win64" -DWITH_GPU=ON -DWITH_MKL=OFF -DWITH_STATIC_LIB=ON -DCMAKE_BUILD_TYPE=Release -DDEMO_NAME=inference_icnet -DPADDLE_LIB=D:\to_the_paddle_inference.lib -DCUDA_LIB=D:\tools\v8.0\lib\x64
+  cmake .. -G "Visual Studio 15 2017 Win64" -T host=x64 -DWITH_GPU=ON -DWITH_MKL=OFF -DWITH_STATIC_LIB=ON -DCMAKE_BUILD_TYPE=Release -DDEMO_NAME=inference_icnet -DPADDLE_LIB=D:\to_the_paddle_inference.lib -DCUDA_LIB=D:\tools\v8.0\lib\x64
 ```
 Then open the generated project file with Visual Studio 2017, make sure static linking ("/MT") is used, and build the exe. Put openblas.dll in the same directory as the exe.
diff --git a/paddle/fluid/inference/api/helper.cc b/paddle/fluid/inference/api/helper.cc
index 9cc491e10d691a206dd903b78c0ea570741da44c..d78560239de50eb224641583d62b55bac75be465 100644
--- a/paddle/fluid/inference/api/helper.cc
+++ b/paddle/fluid/inference/api/helper.cc
@@ -13,6 +13,9 @@
 // limitations under the License.
 
 #include "paddle/fluid/inference/api/helper.h"
+#include "paddle/fluid/extension/include/ext_op_meta_info.h"
+#include "paddle/fluid/framework/custom_operator.h"
+#include "paddle/fluid/framework/operator.h"
 
 namespace paddle {
 namespace inference {
@@ -40,5 +43,20 @@ std::string to_string<std::vector<std::vector<float>>>(
   return ss.str();
 }
 
+void RegisterAllCustomOperator() {
+  auto &op_meta_info_map = OpMetaInfoMap::Instance();
+  const auto &meta_info_map = op_meta_info_map.GetMap();
+  for (auto &pair : meta_info_map) {
+    const auto &all_op_kernels{framework::OperatorWithKernel::AllOpKernels()};
+    if (all_op_kernels.find(pair.first) == all_op_kernels.end()) {
+      framework::RegisterOperatorWithMetaInfo(pair.second);
+    } else {
+      LOG(INFO) << "The operator `" << pair.first
+                << "` has been registered. 
" + "Therefore, we will not repeat the registration here."; + } + } +} + } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h index 14b968f5834da8618f6af16aa8c25e1d1baaae5e..c6d25137594b76a1ff67d9fb25b2480372c3eefa 100644 --- a/paddle/fluid/inference/api/helper.h +++ b/paddle/fluid/inference/api/helper.h @@ -398,5 +398,7 @@ static bool IsFileExists(const std::string &path) { return exists; } +void RegisterAllCustomOperator(); + } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/api/mkldnn_quantizer.cc b/paddle/fluid/inference/api/mkldnn_quantizer.cc index 793fc53d90b768050572a3dd0a080a5d30e959a2..f6cdbb00b50453d4c4ff7fc06ba82aa042dd194a 100644 --- a/paddle/fluid/inference/api/mkldnn_quantizer.cc +++ b/paddle/fluid/inference/api/mkldnn_quantizer.cc @@ -411,7 +411,8 @@ void AnalysisPredictor::MkldnnQuantizer::ClearDeviceContext() const { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); platform::MKLDNNDeviceContext* dev_ctx = (platform::MKLDNNDeviceContext*)pool.Get(predictor_.place_); - dev_ctx->ResetBlobMap(); + dev_ctx->ResetBlobMap( + paddle::platform::MKLDNNDeviceContext::tls().get_curr_exec()); } void AnalysisPredictor::MkldnnQuantizer::PrepareArgument() const { diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index e492b32cb6cbefcc121b616450170e5cc22bb913..2bbd4bb837a22f672e5aa625f299424b6f0c5b88 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -177,7 +177,10 @@ struct PD_INFER_DECL AnalysisConfig { /// void DisableGpu(); - void EnableXpu(int l3_workspace_size = 0xfffc00); + void EnableXpu(int l3_workspace_size = 0xfffc00, bool locked = false, + bool autotune = true, const std::string& autotune_file = "", + const std::string& precision = "int16", + bool adaptive_seqlen = false); /// /// \brief A boolean state telling whether the GPU is turned on. /// @@ -360,6 +363,9 @@ struct PD_INFER_DECL AnalysisConfig { /// bool tensorrt_dla_enabled() { return trt_use_dla_; } + void EnableDlnne(int min_subgraph_size = 3); + bool dlnne_enabled() const { return use_dlnne_; } + /// /// \brief Turn on the usage of Lite sub-graph engine. /// @@ -627,6 +633,10 @@ struct PD_INFER_DECL AnalysisConfig { std::vector trt_disabled_ops_{}; bool disable_trt_plugin_fp16_{false}; + // dlnne related. + bool use_dlnne_{false}; + int dlnne_min_subgraph_size_{3}; + // memory reuse related. bool enable_memory_optim_{false}; @@ -661,6 +671,11 @@ struct PD_INFER_DECL AnalysisConfig { bool thread_local_stream_{false}; bool use_xpu_{false}; int xpu_l3_workspace_size_; + bool xpu_locked_; + bool xpu_autotune_; + std::string xpu_autotune_file_; + std::string xpu_precision_; + bool xpu_adaptive_seqlen_; // mkldnn related. 
 int mkldnn_cache_capacity_{0};
 
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc
index 61fcdb7a90830da06bd41de0d97b8413d5f1f0ff..b2e3de63691c555b24eb6f1e1fb9ffcc35d400f9 100644
--- a/paddle/fluid/inference/api/paddle_pass_builder.cc
+++ b/paddle/fluid/inference/api/paddle_pass_builder.cc
@@ -86,6 +86,7 @@ const std::vector<std::string> kTRTSubgraphPasses({
       "simplify_with_basic_ops_pass",           //
       "embedding_eltwise_layernorm_fuse_pass",  //
       "multihead_matmul_fuse_pass_v2",          //
+      "multihead_matmul_fuse_pass_v3",          //
       "skip_layernorm_fuse_pass",               //
       "conv_bn_fuse_pass",                      //
       "unsqueeze2_eltwise_fuse_pass",           //
@@ -109,6 +110,16 @@ const std::vector<std::string> kTRTSubgraphPasses({
       "transpose_flatten_concat_fuse_pass",
 });
 
+const std::vector<std::string> kDlnneSubgraphPasses({
+    "is_test_pass",                  //
+    "delete_dropout_op_pass",        //
+    "simplify_with_basic_ops_pass",  //
+    "conv_bn_fuse_pass",             //
+    "depthwise_conv_bn_fuse_pass",   //
+    "shuffle_channel_detect_pass",   //
+    "dlnne_subgraph_pass",           //
+});
+
 const std::vector<std::string> kLiteSubgraphPasses({
 #ifdef PADDLE_WITH_LITE
     "lite_subgraph_pass",
@@ -235,8 +246,8 @@ void CpuPassStrategy::EnableMKLDNN() {
       "reshape_transpose_matmul_mkldnn_fuse_pass",  //
       "matmul_transpose_reshape_fuse_pass",         //
       // Disabled due to topology-dependent speed-up
-      //"fc_mkldnn_pass",
-      //"fc_act_mkldnn_fuse_pass",
+      // "fc_mkldnn_pass",
+      // "fc_act_mkldnn_fuse_pass",
       "batch_norm_act_fuse_pass",
       // TODO(intel): Please fix the bug on windows.
       // https://github.com/PaddlePaddle/Paddle/issues/29710
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h
index a725ebab35eadaaaab76a3a7c4580f95b64d827d..d7556b50031b7d63b75e1e0d12fa173f8fe9fd33 100644
--- a/paddle/fluid/inference/api/paddle_pass_builder.h
+++ b/paddle/fluid/inference/api/paddle_pass_builder.h
@@ -242,6 +242,9 @@ class PD_INFER_DECL XpuPassStrategy final : public PassStrategy {
 /// \brief List of tensorRT subgraph passes.
 PD_INFER_DECL extern const std::vector<std::string> kTRTSubgraphPasses;
 
+/// \brief List of dlnne subgraph passes.
+PD_INFER_DECL extern const std::vector<std::string> kDlnneSubgraphPasses;
+
 /// \brief List of lite subgraph passes.
PD_INFER_DECL extern const std::vector kLiteSubgraphPasses; diff --git a/paddle/fluid/inference/capi/pd_config.cc b/paddle/fluid/inference/capi/pd_config.cc index 231639667244d8646faa94ca453227de923c5814..9bb52ba57802512f393c23f957cc38ddabb878b1 100644 --- a/paddle/fluid/inference/capi/pd_config.cc +++ b/paddle/fluid/inference/capi/pd_config.cc @@ -260,6 +260,22 @@ bool PD_TensorrtEngineEnabled(const PD_AnalysisConfig* config) { return config->config.tensorrt_engine_enabled(); } +void PD_EnableDlnne(PD_AnalysisConfig* config, int min_subgraph_size) { + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); + config->config.EnableDlnne(min_subgraph_size); +} + +bool PD_DlnneEnabled(const PD_AnalysisConfig* config) { + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); + return config->config.dlnne_enabled(); +} + void PD_SwitchIrDebug(PD_AnalysisConfig* config, bool x) { PADDLE_ENFORCE_NOT_NULL( config, diff --git a/paddle/fluid/inference/capi/pd_predictor.cc b/paddle/fluid/inference/capi/pd_predictor.cc index c1bf4c974fac8c80c3e8e31fbd247332a325e2aa..c4e195b6ec8fabb3b831b37fd9b46b3d81a92371 100644 --- a/paddle/fluid/inference/capi/pd_predictor.cc +++ b/paddle/fluid/inference/capi/pd_predictor.cc @@ -207,13 +207,16 @@ int PD_GetOutputNum(const PD_Predictor* predictor) { } const char* PD_GetInputName(const PD_Predictor* predictor, int n) { - static std::vector names = predictor->predictor->GetInputNames(); + static std::vector names; + names.resize(predictor->predictor->GetInputNames().size()); + names[n] = predictor->predictor->GetInputNames()[n]; return names[n].c_str(); } const char* PD_GetOutputName(const PD_Predictor* predictor, int n) { - static std::vector names = - predictor->predictor->GetOutputNames(); + static std::vector names; + names.resize(predictor->predictor->GetOutputNames().size()); + names[n] = predictor->predictor->GetOutputNames()[n]; return names[n].c_str(); } diff --git a/paddle/fluid/inference/capi_exp/CMakeLists.txt b/paddle/fluid/inference/capi_exp/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..521d24329d46411a8674ebe783b77f5a585a6551 --- /dev/null +++ b/paddle/fluid/inference/capi_exp/CMakeLists.txt @@ -0,0 +1,29 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
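+#
+# Build the experimental C API from the sources below: a static
+# paddle_inference_c library is always created, and a shared
+# paddle_inference_c library is additionally produced for ON_INFER builds.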
+# + +set(C_API_SRCS pd_config.cc pd_predictor.cc pd_tensor.cc pd_utils.cc) + +cc_library(paddle_inference_c SRCS ${C_API_SRCS} DEPS paddle_inference) + +if(NOT ON_INFER) + return() +endif() + +# Create inference capi shared library +cc_library(paddle_inference_c_shared SHARED SRCS ${C_API_SRCS} DEPS paddle_inference) +set_target_properties(paddle_inference_c_shared PROPERTIES OUTPUT_NAME paddle_inference_c) +if(WIN32) + target_link_libraries(paddle_inference_c_shared shlwapi.lib) +endif() diff --git a/paddle/fluid/inference/capi_exp/lod_demo.cc b/paddle/fluid/inference/capi_exp/lod_demo.cc new file mode 100644 index 0000000000000000000000000000000000000000..2b049e992e71dd64c6616b2ac5c951ee10ea7909 --- /dev/null +++ b/paddle/fluid/inference/capi_exp/lod_demo.cc @@ -0,0 +1,102 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/// +/// \file lod_demo.cc +/// +/// \brief a demo for user to learn how to inference by c api. +/// it rectify from +/// paddle/fluid/inference/tests/api/analyzer_capi_exp_ner_tester.cc. +/// +/// \author paddle-infer@baidu.com +/// \date 2021-04-21 +/// \since 2.1 +/// + +#include +#include +#include +#include +#include +#include "paddle/fluid/inference/capi_exp/pd_inference_api.h" + +int main(int argc, char *argv[]) { + auto model_dir = FLAGS_infer_model; + PD_Config *config = PD_ConfigCreate(); + PD_ConfigSetModel(config, (model_dir + "/__model__").c_str(), + (model_dir + "/param").c_str()); + PD_ConfigDisableGpu(config); + + PD_Predictor *predictor = PD_PredictorCreate(config); + size_t input_num = PD_PredictorGetInputNum(predictor); + size_t output_num = PD_PredictorGetOutputNum(predictor); + + PD_OneDimArrayCstr *input_names = PD_PredictorGetInputNames(predictor); + LOG(INFO) << "Predictor start run!"; + PD_Tensor *inputs[2]; + inputs[0] = PD_PredictorGetInputHandle(predictor, input_names->data[0]); + inputs[1] = PD_PredictorGetInputHandle(predictor, input_names->data[1]); + LOG(INFO) << "Predictor start run!"; + // inputs[0]: word, use lod memory in stack + int32_t shape_0[2] = {11, 1}; + int64_t data_0[11 * 1] = {12673, 9763, 905, 284, 45, 7474, 20, 17, 1, 4, 9}; + size_t lod_layer_0[2] = {0, 11}; + PD_OneDimArraySize layer_0; + layer_0.size = 2; + layer_0.data = lod_layer_0; + PD_OneDimArraySize *layer_0_ptr = &layer_0; + PD_TwoDimArraySize lod_0; + lod_0.size = 1; + lod_0.data = &layer_0_ptr; + PD_TensorReshape(inputs[0], 2, shape_0); + PD_TensorCopyFromCpuInt64(inputs[0], data_0); + PD_TensorSetLod(inputs[0], &lod_0); + + // inputs[1]: mention, use lod memory in heap + int32_t shape_1[2] = {11, 1}; + int64_t data_1[11 * 1] = {27, 0, 0, 33, 34, 33, 0, 0, 0, 1, 2}; + PD_TwoDimArraySize *lod_1_ptr = new PD_TwoDimArraySize(); + lod_1_ptr->size = 1; + lod_1_ptr->data = new PD_OneDimArraySize *[1]; + lod_1_ptr->data[0] = new PD_OneDimArraySize(); + lod_1_ptr->data[0]->size = 2; + lod_1_ptr->data[0]->data = new size_t[2]; + lod_1_ptr->data[0]->data[0] = 0; + lod_1_ptr->data[0]->data[1] = 
11; + + PD_TensorReshape(inputs[1], 2, shape_1); + PD_TensorCopyFromCpuInt64(inputs[1], data_1); + PD_TensorSetLod(inputs[1], lod_1_ptr); + // retrieve the lod memory + delete[] lod_1_ptr->data[0]->data; + delete lod_1_ptr->data[0]; + delete[] lod_1_ptr->data; + delete lod_1_ptr; + lod_1_ptr = nullptr; + + PD_PredictorRun(predictor); + PD_OneDimArrayCstr *output_names = PD_PredictorGetOutputNames(predictor); + PD_Tensor *output = + PD_PredictorGetOutputHandle(predictor, output_names->data[0]); + PD_TwoDimArraySize *output_lod = PD_TensorGetLod(output); + + PD_TwoDimArraySizeDestroy(output_lod); + PD_TensorDestroy(output); + PD_OneDimArrayCstrDestroy(output_names); + + PD_TensorDestroy(inputs[0]); + PD_TensorDestroy(inputs[1]); + PD_OneDimArrayCstrDestroy(input_names); + PD_PredictorDestroy(predictor); +} diff --git a/paddle/fluid/inference/capi_exp/pd_common.h b/paddle/fluid/inference/capi_exp/pd_common.h new file mode 100644 index 0000000000000000000000000000000000000000..4b70ed7fbad297efdf1863317e3af2b69bed702b --- /dev/null +++ b/paddle/fluid/inference/capi_exp/pd_common.h @@ -0,0 +1,75 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#if defined(_WIN32) +#ifdef PADDLE_DLL_INFERENCE +#define PADDLE_CAPI_EXPORT __declspec(dllexport) +#else +#define PADDLE_CAPI_EXPORT __declspec(dllimport) +#endif // PADDLE_DLL_INFERENCE +#else +#define PADDLE_CAPI_EXPORT __attribute__((visibility("default"))) +#endif // _WIN32 + +/// +/// __pd_give means that a new object is returned. The user should make sure +/// that the returned pointer is used exactly once as a value for an __pd_take +/// argument. In between, it can be used as a value for as many __pd_keep +/// arguments as the user likes. +/// +#ifndef __pd_give +#define __pd_give +#endif +/// +/// __pd_take means that the object the argument points to is taken over by the +/// function and may no longer be used by the user as an argument to any other +/// function. The pointer value must be one returned by a function returning an +/// __pd_give pointer. +/// +#ifndef __pd_take +#define __pd_take +#endif +/// +/// __pd_keep means that the function will only use the object temporarily. The +/// object which the argument points to is not taken over by the function. After +/// the function has finished, the user can still use it as an argument to other +/// functions. 
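+///
+/// For example (illustrative, based on the declarations in this directory):
+/// PD_PredictorCreate() returns a __pd_give PD_Predictor* that is eventually
+/// handed to the __pd_take parameter of PD_PredictorDestroy(), while
+/// PD_PredictorRun() only borrows the predictor through a __pd_keep pointer.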
+/// +#ifndef __pd_keep +#define __pd_keep +#endif + +typedef int8_t PD_Bool; +#define TRUE 1 +#define FALSE 0 + +#define PD_ENUM(type) \ + typedef int32_t type; \ + enum + +PD_ENUM(PD_PrecisionType){PD_PRECISION_FLOAT32 = 0, PD_PRECISION_INT8, + PD_PRECISION_HALF}; + +PD_ENUM(PD_PlaceType){PD_PLACE_UNK = -1, PD_PLACE_CPU, PD_PLACE_GPU, + PD_PLACE_XPU}; + +PD_ENUM(PD_DataType){ + PD_DATA_UNK = -1, PD_DATA_FLOAT32, PD_DATA_INT32, + PD_DATA_INT64, PD_DATA_UINT8, +}; diff --git a/paddle/fluid/inference/capi_exp/pd_config.cc b/paddle/fluid/inference/capi_exp/pd_config.cc new file mode 100644 index 0000000000000000000000000000000000000000..c45454e86bdaac5e8f054da91410eab7e2b873a2 --- /dev/null +++ b/paddle/fluid/inference/capi_exp/pd_config.cc @@ -0,0 +1,382 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/capi_exp/pd_config.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/platform/enforce.h" + +#define CHECK_NULL_POINTER_PARM(param) \ + PADDLE_ENFORCE_NOT_NULL( \ + param, paddle::platform::errors::InvalidArgument( \ + "The pointer of " #param " shouldn't be nullptr")) + +#define CHECK_AND_CONVERT_PD_CONFIG \ + PADDLE_ENFORCE_NOT_NULL( \ + pd_config, paddle::platform::errors::InvalidArgument( \ + "The pointer of paddle config shouldn't be nullptr")); \ + Config* config = reinterpret_cast(pd_config) + +using paddle_infer::Config; + +static Config::Precision ConvertToCxxPrecisionType(PD_PrecisionType precision) { + switch (precision) { + case PD_PRECISION_FLOAT32: + return Config::Precision::kFloat32; + case PD_PRECISION_INT8: + return Config::Precision::kInt8; + case PD_PRECISION_HALF: + return Config::Precision::kHalf; + default: + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "Unsupport paddle precision type %d.", precision)); + return Config::Precision::kFloat32; + } +} + +extern "C" { +__pd_give PD_Config* PD_ConfigCreate() { + return reinterpret_cast(new Config()); +} + +void PD_ConfigDestroy(__pd_take PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + delete reinterpret_cast(config); +} + +void PD_ConfigSetModel(__pd_keep PD_Config* pd_config, + const char* prog_file_path, + const char* params_file_path) { + CHECK_AND_CONVERT_PD_CONFIG; + CHECK_NULL_POINTER_PARM(prog_file_path); + CHECK_NULL_POINTER_PARM(params_file_path); + config->SetModel(prog_file_path, params_file_path); +} +void PD_ConfigSetProgFile(__pd_keep PD_Config* pd_config, + const char* prog_file_path) { + CHECK_AND_CONVERT_PD_CONFIG; + CHECK_NULL_POINTER_PARM(prog_file_path); + config->SetProgFile(prog_file_path); +} +void PD_ConfigSetParamsFile(__pd_keep PD_Config* pd_config, + const char* params_file_path) { + CHECK_AND_CONVERT_PD_CONFIG; + CHECK_NULL_POINTER_PARM(params_file_path); + config->SetParamsFile(params_file_path); +} +void PD_ConfigSetOptimCacheDir(__pd_keep PD_Config* pd_config, + const char* opt_cache_dir) { + CHECK_AND_CONVERT_PD_CONFIG; + 
CHECK_NULL_POINTER_PARM(opt_cache_dir); + config->SetOptimCacheDir(opt_cache_dir); +} + +void PD_ConfigSetModelDir(__pd_keep PD_Config* pd_config, + const char* model_dir) { + CHECK_AND_CONVERT_PD_CONFIG; + CHECK_NULL_POINTER_PARM(model_dir); + config->SetModel(model_dir); +} +const char* PD_ConfigGetModelDir(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->model_dir().c_str(); +} +const char* PD_ConfigGetProgFile(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->prog_file().c_str(); +} +const char* PD_ConfigGetParamsFile(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->params_file().c_str(); +} + +void PD_ConfigDisableFCPadding(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + config->DisableFCPadding(); +} +PD_Bool PD_ConfigUseFcPadding(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->use_fc_padding(); +} + +void PD_ConfigEnableUseGpu(__pd_keep PD_Config* pd_config, + uint64_t memory_pool_init_size_mb, + int32_t device_id) { + CHECK_AND_CONVERT_PD_CONFIG; + config->EnableUseGpu(memory_pool_init_size_mb, device_id); +} +void PD_ConfigDisableGpu(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + config->DisableGpu(); +} +PD_Bool PD_ConfigUseGpu(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->use_gpu(); +} + +void PD_ConfigEnableXpu(__pd_keep PD_Config* pd_config, + int32_t l3_workspace_size) { + CHECK_AND_CONVERT_PD_CONFIG; + config->EnableXpu(l3_workspace_size); +} +PD_Bool PD_ConfigUseXpu(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->use_xpu(); +} + +int32_t PD_ConfigGpuDeviceId(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->gpu_device_id(); +} +int32_t PD_ConfigXpuDeviceId(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->xpu_device_id(); +} +int32_t PD_ConfigMemoryPoolInitSizeMb(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->memory_pool_init_size_mb(); +} +float PD_ConfigFractionOfGpuMemoryForPool(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->fraction_of_gpu_memory_for_pool(); +} +void PD_ConfigEnableCudnn(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + config->EnableCUDNN(); +} +PD_Bool PD_ConfigCudnnEnabled(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->cudnn_enabled(); +} + +void PD_ConfigSwitchIrOptim(__pd_keep PD_Config* pd_config, PD_Bool x) { + CHECK_AND_CONVERT_PD_CONFIG; + config->SwitchIrOptim(x); +} +PD_Bool PD_ConfigIrOptim(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->ir_optim(); +} + +void PD_ConfigEnableTensorRtEngine(__pd_keep PD_Config* pd_config, + int32_t workspace_size, + int32_t max_batch_size, + int32_t min_subgraph_size, + PD_PrecisionType precision, + PD_Bool use_static, PD_Bool use_calib_mode) { + CHECK_AND_CONVERT_PD_CONFIG; + config->EnableTensorRtEngine( + workspace_size, max_batch_size, min_subgraph_size, + ConvertToCxxPrecisionType(precision), use_static, use_calib_mode); +} +PD_Bool PD_ConfigTensorRtEngineEnabled(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->tensorrt_engine_enabled(); +} + +void PD_ConfigSetTrtDynamicShapeInfo(__pd_keep PD_Config* pd_config, + size_t tensor_num, + const char** tensor_name, + size_t* shapes_num, int32_t** min_shape, + int32_t** 
max_shape, int32_t** optim_shape, + PD_Bool disable_trt_plugin_fp16) { + CHECK_AND_CONVERT_PD_CONFIG; + std::map> min_input_shapes; + std::map> max_input_shapes; + std::map> optim_input_shapes; + for (size_t tensor_index = 0; tensor_index < tensor_num; ++tensor_index) { + std::string name(tensor_name[tensor_index]); + std::vector min_input_shape, max_input_shape, optim_input_shape; + for (size_t shape_index = 0; shape_index < shapes_num[tensor_index]; + ++shape_index) { + min_input_shape.emplace_back(min_shape[tensor_index][shape_index]); + max_input_shape.emplace_back(max_shape[tensor_index][shape_index]); + optim_input_shape.emplace_back(optim_shape[tensor_index][shape_index]); + } + min_input_shapes[name] = std::move(min_input_shape); + max_input_shapes[name] = std::move(max_input_shape); + optim_input_shapes[name] = std::move(optim_input_shape); + } + config->SetTRTDynamicShapeInfo(min_input_shapes, max_input_shapes, + optim_input_shapes, disable_trt_plugin_fp16); +} + +void PD_ConfigDisableTensorRtOPs(__pd_keep PD_Config* pd_config, size_t ops_num, + const char** ops_name) { + CHECK_AND_CONVERT_PD_CONFIG; + std::vector ops_list; + for (size_t index = 0; index < ops_num; ++index) { + ops_list.emplace_back(ops_name[index]); + } + config->Exp_DisableTensorRtOPs(ops_list); +} + +void PD_ConfigEnableTensorRtOSS(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + config->EnableTensorRtOSS(); +} +PD_Bool PD_ConfigTensorRtOssEnabled(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->tensorrt_oss_enabled(); +} + +void PD_ConfigEnableTensorRtDla(__pd_keep PD_Config* pd_config, + int32_t dla_core) { + CHECK_AND_CONVERT_PD_CONFIG; + config->EnableTensorRtDLA(dla_core); +} +PD_Bool PD_ConfigTensorRtDlaEnabled(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->tensorrt_dla_enabled(); +} + +void PD_ConfigEnableLiteEngine(__pd_keep PD_Config* pd_config, + PD_PrecisionType precision, PD_Bool zero_copy, + size_t passes_filter_num, + const char** passes_filter, + size_t ops_filter_num, const char** ops_filter) { + CHECK_AND_CONVERT_PD_CONFIG; + std::vector passes_filters, ops_filters; + for (size_t index = 0; index < passes_filter_num; ++index) { + passes_filters.emplace_back(passes_filter[index]); + } + for (size_t index = 0; index < ops_filter_num; ++index) { + ops_filters.emplace_back(ops_filter[index]); + } + config->EnableLiteEngine(ConvertToCxxPrecisionType(precision), zero_copy, + passes_filters, ops_filters); +} +PD_Bool PD_ConfigLiteEngineEnabled(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->lite_engine_enabled(); +} + +void PD_ConfigSwitchIrDebug(__pd_keep PD_Config* pd_config, PD_Bool x) { + CHECK_AND_CONVERT_PD_CONFIG; + config->SwitchIrDebug(x); +} +void PD_ConfigEnableMKLDNN(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + config->EnableMKLDNN(); +} +void PD_ConfigSetMkldnnCacheCapacity(__pd_keep PD_Config* pd_config, + int32_t capacity) { + CHECK_AND_CONVERT_PD_CONFIG; + config->SetMkldnnCacheCapacity(capacity); +} +PD_Bool PD_ConfigMkldnnEnabled(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->mkldnn_enabled(); +} +void PD_ConfigSetCpuMathLibraryNumThreads( + __pd_keep PD_Config* pd_config, int32_t cpu_math_library_num_threads) { + CHECK_AND_CONVERT_PD_CONFIG; + config->SetCpuMathLibraryNumThreads(cpu_math_library_num_threads); +} +int32_t PD_ConfigGetCpuMathLibraryNumThreads(__pd_keep PD_Config* pd_config) { + 
CHECK_AND_CONVERT_PD_CONFIG; + return config->cpu_math_library_num_threads(); +} + +void PD_ConfigSetMkldnnOp(__pd_keep PD_Config* pd_config, size_t ops_num, + const char** op_list) { + CHECK_AND_CONVERT_PD_CONFIG; + std::unordered_set op_names; + for (size_t index = 0; index < ops_num; ++index) { + op_names.emplace(op_list[index]); + } + config->SetMKLDNNOp(std::move(op_names)); +} +void PD_ConfigEnableMkldnnQuantizer(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + config->EnableMkldnnQuantizer(); +} +void PD_ConfigEnableMkldnnBfloat16(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + config->EnableMkldnnBfloat16(); +} +PD_Bool PD_ConfigMkldnnBfloat16Enabled(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->mkldnn_bfloat16_enabled(); +} +void PD_ConfigSetBfloat16Op(__pd_keep PD_Config* pd_config, size_t ops_num, + const char** op_list) { + CHECK_AND_CONVERT_PD_CONFIG; + std::unordered_set op_names; + for (size_t index = 0; index < ops_num; ++index) { + op_names.emplace(op_list[index]); + } + config->SetBfloat16Op(std::move(op_names)); +} +PD_Bool PD_ConfigThreadLocalStreamEnabled(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->thread_local_stream_enabled(); +} +PD_Bool PD_ConfigMkldnnQuantizerEnabled(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->mkldnn_quantizer_enabled(); +} +void PD_ConfigSetModelBuffer(__pd_keep PD_Config* pd_config, + const char* prog_buffer, size_t prog_buffer_size, + const char* params_buffer, + size_t params_buffer_size) { + CHECK_AND_CONVERT_PD_CONFIG; + config->SetModelBuffer(prog_buffer, prog_buffer_size, params_buffer, + params_buffer_size); +} +PD_Bool PD_ConfigModelFromMemory(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->model_from_memory(); +} +void PD_ConfigEnableMemoryOptim(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + config->EnableMemoryOptim(); +} +PD_Bool PD_ConfigMemoryOptimEnabled(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->enable_memory_optim(); +} +void PD_ConfigEnableProfile(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + config->EnableProfile(); +} +PD_Bool PD_ConfigProfileEnabled(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->profile_enabled(); +} +void PD_ConfigDisableGlogInfo(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + config->DisableGlogInfo(); +} +PD_Bool PD_ConfigGlogInfoDisabled(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->glog_info_disabled(); +} +void PD_ConfigSetInvalid(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + config->SetInValid(); +} +PD_Bool PD_ConfigIsValid(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->is_valid(); +} +void PD_ConfigEnableGpuMultiStream(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + config->EnableGpuMultiStream(); +} +void PD_ConfigPartiallyRelease(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + config->PartiallyRelease(); +} + +} // extern "C" diff --git a/paddle/fluid/inference/capi_exp/pd_config.h b/paddle/fluid/inference/capi_exp/pd_config.h new file mode 100644 index 0000000000000000000000000000000000000000..e44983e24484eae930afa6b84db397ac3aad8f08 --- /dev/null +++ b/paddle/fluid/inference/capi_exp/pd_config.h @@ -0,0 +1,571 @@ +// Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/// +/// \file pd_config.h +/// +/// \brief interface for paddle config +/// +/// \author paddle-infer@baidu.com +/// \date 2021-04-21 +/// \since 2.1 +/// + +#pragma once + +#include "pd_common.h" // NOLINT + +typedef struct PD_Config PD_Config; + +#ifdef __cplusplus +extern "C" { +#endif + +/// +/// \brief Create a paddle config +/// +/// \return new config. +/// +PADDLE_CAPI_EXPORT extern __pd_give PD_Config* PD_ConfigCreate(); +/// +/// \brief Destroy the paddle config +/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigDestroy(__pd_take PD_Config* pd_config); +/// +/// \brief Set the combined model with two specific pathes for program and +/// parameters. +/// +/// \param[in] pd_onfig config +/// \param[in] prog_file_path model file path of the combined model. +/// \param[in] params_file_path params file path of the combined model. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigSetModel(__pd_keep PD_Config* pd_config, + const char* prog_file_path, + const char* params_file_path); +/// +/// \brief Set the model file path of a combined model. +/// +/// \param[in] pd_onfig config +/// \param[in] prog_file_path model file path. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigSetProgFile( + __pd_keep PD_Config* pd_config, const char* prog_file_path); +/// +/// \brief Set the params file path of a combined model. +/// +/// \param[in] pd_onfig config +/// \param[in] params_file_path params file path. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigSetParamsFile( + __pd_keep PD_Config* pd_config, const char* params_file_path); +/// +/// \brief Set the path of optimization cache directory. +/// \param[in] pd_onfig config +/// \param[in] opt_cache_dir the path of optimization cache directory. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigSetOptimCacheDir( + __pd_keep PD_Config* pd_config, const char* opt_cache_dir); +/// +/// \brief Set the no-combined model dir path. +/// \param[in] pd_onfig config +/// \param[in] model_dir model dir path. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigSetModelDir( + __pd_keep PD_Config* pd_config, const char* model_dir); +/// +/// \brief Get the model directory path. +/// +/// \param[in] pd_onfig config +/// \return The model directory path. +/// +PADDLE_CAPI_EXPORT extern const char* PD_ConfigGetModelDir( + __pd_keep PD_Config* pd_config); +/// +/// \brief Get the program file path. +/// +/// \param[in] pd_onfig config +/// \return The program file path. +/// +PADDLE_CAPI_EXPORT extern const char* PD_ConfigGetProgFile( + __pd_keep PD_Config* pd_config); +/// +/// \brief Get the params file path. +/// +/// \param[in] pd_onfig config +/// \return The params file path. +/// +PADDLE_CAPI_EXPORT extern const char* PD_ConfigGetParamsFile( + __pd_keep PD_Config* pd_config); +/// +/// \brief Turn off FC Padding. 
+/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigDisableFCPadding( + __pd_keep PD_Config* pd_config); +/// +/// \brief A boolean state telling whether fc padding is used. +/// +/// \param[in] pd_onfig config +/// \return Whether fc padding is used. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigUseFcPadding( + __pd_keep PD_Config* pd_config); +/// +/// \brief Turn on GPU. +/// +/// \param[in] pd_onfig config +/// \param[in] memory_pool_init_size_mb initial size of the GPU memory pool in +/// MB. +/// \param[in] device_id device_id the GPU card to use. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigEnableUseGpu( + __pd_keep PD_Config* pd_config, uint64_t memory_pool_init_size_mb, + int32_t device_id); +/// +/// \brief Turn off GPU. +/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigDisableGpu( + __pd_keep PD_Config* pd_config); +/// +/// \brief A boolean state telling whether the GPU is turned on. +/// +/// \brief Turn off GPU. +/// \return Whether the GPU is turned on. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigUseGpu( + __pd_keep PD_Config* pd_config); +/// +/// \brief Turn on XPU. +/// +/// \param[in] pd_onfig config +/// \param[in] l3_workspace_size l3 workspace size. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigEnableXpu( + __pd_keep PD_Config* pd_config, int32_t l3_workspace_size); +/// +/// \brief A boolean state telling whether the XPU is turned on. +/// +/// \param[in] pd_onfig config +/// \return Whether the XPU is turned on. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigUseXpu( + __pd_keep PD_Config* pd_config); +/// +/// \brief Get the GPU device id. +/// +/// \param[in] pd_onfig config +/// \return The GPU device id. +/// +PADDLE_CAPI_EXPORT extern int32_t PD_ConfigGpuDeviceId( + __pd_keep PD_Config* pd_config); +/// +/// \brief Get the XPU device id. +/// +/// \param[in] pd_onfig config +/// \return The XPU device id. +/// +PADDLE_CAPI_EXPORT extern int32_t PD_ConfigXpuDeviceId( + __pd_keep PD_Config* pd_config); +/// +/// \brief Get the initial size in MB of the GPU memory pool. +/// +/// \param[in] pd_onfig config +/// \return The initial size in MB of the GPU memory pool. +/// +PADDLE_CAPI_EXPORT extern int32_t PD_ConfigMemoryPoolInitSizeMb( + __pd_keep PD_Config* pd_config); +/// +/// \brief Get the proportion of the initial memory pool size compared to the +/// device. +/// +/// \param[in] pd_onfig config +/// \return The proportion of the initial memory pool size. +/// +PADDLE_CAPI_EXPORT extern float PD_ConfigFractionOfGpuMemoryForPool( + __pd_keep PD_Config* pd_config); +/// +/// \brief Turn on CUDNN. +/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigEnableCudnn( + __pd_keep PD_Config* pd_config); +/// +/// \brief A boolean state telling whether to use CUDNN. +/// +/// \param[in] pd_onfig config +/// \return Whether to use CUDNN. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigCudnnEnabled( + __pd_keep PD_Config* pd_config); +/// +/// \brief Control whether to perform IR graph optimization. +/// If turned off, the AnalysisConfig will act just like a NativeConfig. +/// +/// \param[in] pd_onfig config +/// \param[in] x Whether the ir graph optimization is actived. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigSwitchIrOptim( + __pd_keep PD_Config* pd_config, PD_Bool x); +/// +/// \brief A boolean state telling whether the ir graph optimization is +/// actived. +/// +/// \param[in] pd_onfig config +/// \return Whether to use ir graph optimization. 
+/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigIrOptim( + __pd_keep PD_Config* pd_config); +/// +/// \brief Turn on the TensorRT engine. +/// The TensorRT engine will accelerate some subgraphes in the original Fluid +/// computation graph. In some models such as resnet50, GoogleNet and so on, +/// it gains significant performance acceleration. +/// +/// \param[in] pd_onfig config +/// \param[in] workspace_size The memory size(in byte) used for TensorRT +/// workspace. +/// \param[in] max_batch_size The maximum batch size of this prediction task, +/// better set as small as possible for less performance loss. +/// \param[in] min_subgrpah_size The minimum TensorRT subgraph size needed, if a +/// subgraph is smaller than this, it will not be transferred to TensorRT +/// engine. +/// \param[in] precision The precision used in TensorRT. +/// \param[in] use_static Serialize optimization information to disk for +/// reusing. +/// \param[in] use_calib_mode Use TRT int8 calibration(post training +/// quantization). +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigEnableTensorRtEngine( + __pd_keep PD_Config* pd_config, int32_t workspace_size, + int32_t max_batch_size, int32_t min_subgraph_size, + PD_PrecisionType precision, PD_Bool use_static, PD_Bool use_calib_mode); +/// +/// \brief A boolean state telling whether the TensorRT engine is used. +/// +/// \param[in] pd_onfig config +/// \return Whether the TensorRT engine is used. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigTensorRtEngineEnabled( + __pd_keep PD_Config* pd_config); +/// +/// \brief Set min, max, opt shape for TensorRT Dynamic shape mode. +/// +/// \param[in] pd_onfig config +/// \param[in] tensor_num The number of the subgraph input. +/// \param[in] tensor_name The name of every subgraph input. +/// \param[in] shapes_num The shape size of every subgraph input. +/// \param[in] min_shape The min input shape of every subgraph input. +/// \param[in] max_shape The max input shape of every subgraph input. +/// \param[in] optim_shape The opt input shape of every subgraph input. +/// \param[in] disable_trt_plugin_fp16 Setting this parameter to true means that +/// TRT plugin will not run fp16. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigSetTrtDynamicShapeInfo( + __pd_keep PD_Config* pd_config, size_t tensor_num, const char** tensor_name, + size_t* shapes_num, int32_t** min_shape, int32_t** max_shape, + int32_t** optim_shape, PD_Bool disable_trt_plugin_fp16); +/// +/// \brief Prevent ops running in Paddle-TRT +/// NOTE: just experimental, not an official stable API, easy to be broken. +/// +/// \param[in] pd_onfig config +/// \param[in] ops_num ops number +/// \param[in] ops_name ops name +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigDisableTensorRtOPs( + __pd_keep PD_Config* pd_config, size_t ops_num, const char** ops_name); +/// +/// \brief Replace some TensorRT plugins to TensorRT OSS( +/// https://github.com/NVIDIA/TensorRT), with which some models's inference +/// may be more high-performance. Libnvinfer_plugin.so greater than +/// V7.2.1 is needed. +/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigEnableTensorRtOSS( + __pd_keep PD_Config* pd_config); +/// +/// \brief A boolean state telling whether to use the TensorRT OSS. +/// +/// \param[in] pd_onfig config +/// \return Whether to use the TensorRT OSS. 
+/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigTensorRtOssEnabled( + __pd_keep PD_Config* pd_config); +/// +/// \brief Enable TensorRT DLA +/// +/// \param[in] pd_onfig config +/// \param[in] dla_core ID of DLACore, which should be 0, 1, +/// ..., IBuilder.getNbDLACores() - 1 +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigEnableTensorRtDla( + __pd_keep PD_Config* pd_config, int32_t dla_core); +/// +/// \brief A boolean state telling whether to use the TensorRT DLA. +/// +/// \param[in] pd_onfig config +/// \return Whether to use the TensorRT DLA. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigTensorRtDlaEnabled( + __pd_keep PD_Config* pd_config); +/// +/// \brief Turn on the usage of Lite sub-graph engine. +/// +/// \param[in] pd_onfig config +/// \param[in] precision Precion used in Lite sub-graph engine. +/// \param[in] zero_copy whether use zero copy. +/// \param[in] passes_filter_num The number of passes used in Lite sub-graph +/// engine. +/// \param[in] passes_filter The name of passes used in Lite sub-graph engine. +/// \param[in] ops_filter_num The number of operators not supported by Lite. +/// \param[in] ops_filter The name of operators not supported by Lite. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigEnableLiteEngine( + __pd_keep PD_Config* pd_config, PD_PrecisionType precision, + PD_Bool zero_copy, size_t passes_filter_num, const char** passes_filter, + size_t ops_filter_num, const char** ops_filter); +/// +/// \brief A boolean state indicating whether the Lite sub-graph engine is +/// used. +/// +/// \param[in] pd_onfig config +/// \return Whether the Lite sub-graph engine is used. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigLiteEngineEnabled( + __pd_keep PD_Config* pd_config); +/// +/// \brief Control whether to debug IR graph analysis phase. +/// This will generate DOT files for visualizing the computation graph after +/// each analysis pass applied. +/// +/// \param[in] pd_onfig config +/// \param[in] x whether to debug IR graph analysis phase. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigSwitchIrDebug( + __pd_keep PD_Config* pd_config, PD_Bool x); +/// +/// \brief Turn on MKLDNN. +/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigEnableMKLDNN( + __pd_keep PD_Config* pd_config); +/// +/// \brief Set the cache capacity of different input shapes for MKLDNN. +/// Default value 0 means not caching any shape. +/// Please see MKL-DNN Data Caching Design Document: +/// https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/mkldnn/caching/caching.md +/// +/// \param[in] pd_onfig config +/// \param[in] capacity The cache capacity. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigSetMkldnnCacheCapacity( + __pd_keep PD_Config* pd_config, int32_t capacity); +/// +/// \brief A boolean state telling whether to use the MKLDNN. +/// +/// \param[in] pd_onfig config +/// \return Whether to use the MKLDNN. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigMkldnnEnabled( + __pd_keep PD_Config* pd_config); +/// +/// \brief Set the number of cpu math library threads. +/// +/// \param[in] pd_onfig config +/// \param cpu_math_library_num_threads The number of cpu math library +/// threads. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigSetCpuMathLibraryNumThreads( + __pd_keep PD_Config* pd_config, int32_t cpu_math_library_num_threads); +/// +/// \brief An int state telling how many threads are used in the CPU math +/// library. +/// +/// \param[in] pd_onfig config +/// \return The number of threads used in the CPU math library. 
+/// +PADDLE_CAPI_EXPORT extern int32_t PD_ConfigGetCpuMathLibraryNumThreads( + __pd_keep PD_Config* pd_config); +/// +/// \brief Specify the operator type list to use MKLDNN acceleration. +/// +/// \param[in] pd_onfig config +/// \param[in] ops_num The number of operator type list. +/// \param[in] op_list The name of operator type list. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigSetMkldnnOp( + __pd_keep PD_Config* pd_config, size_t ops_num, const char** op_list); +/// +/// \brief Turn on MKLDNN quantization. +/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigEnableMkldnnQuantizer( + __pd_keep PD_Config* pd_config); +/// +/// \brief A boolean state telling whether the MKLDNN quantization is enabled. +/// +/// \param[in] pd_onfig config +/// \return Whether the MKLDNN quantization is enabled. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigMkldnnQuantizerEnabled( + __pd_keep PD_Config* pd_config); +/// +/// \brief Turn on MKLDNN bfloat16. +/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigEnableMkldnnBfloat16( + __pd_keep PD_Config* pd_config); +/// +/// \brief A boolean state telling whether to use the MKLDNN Bfloat16. +/// +/// \param[in] pd_onfig config +/// \return Whether to use the MKLDNN Bfloat16. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigMkldnnBfloat16Enabled( + __pd_keep PD_Config* pd_config); +/// \brief Specify the operator type list to use Bfloat16 acceleration. +/// +/// \param[in] pd_onfig config +/// \param[in] ops_num The number of operator type list. +/// \param[in] op_list The name of operator type list. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigSetBfloat16Op( + __pd_keep PD_Config* pd_config, size_t ops_num, const char** op_list); +/// +/// \brief Enable the GPU multi-computing stream feature. +/// NOTE: The current behavior of this interface is to bind the computation +/// stream to the thread, and this behavior may be changed in the future. +/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigEnableGpuMultiStream( + __pd_keep PD_Config* pd_config); +/// +/// \brief A boolean state telling whether the thread local CUDA stream is +/// enabled. +/// +/// \param[in] pd_onfig config +/// \return Whether the thread local CUDA stream is enabled. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigThreadLocalStreamEnabled( + __pd_keep PD_Config* pd_config); +/// +/// \brief Specify the memory buffer of program and parameter. +/// Used when model and params are loaded directly from memory. +/// +/// \param[in] pd_onfig config +/// \param[in] prog_buffer The memory buffer of program. +/// \param[in] prog_buffer_size The size of the model data. +/// \param[in] params_buffer The memory buffer of the combined parameters file. +/// \param[in] params_buffer_size The size of the combined parameters data. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigSetModelBuffer( + __pd_keep PD_Config* pd_config, const char* prog_buffer, + size_t prog_buffer_size, const char* params_buffer, + size_t params_buffer_size); +/// +/// \brief A boolean state telling whether the model is set from the CPU +/// memory. +/// +/// \param[in] pd_onfig config +/// \return Whether model and params are loaded directly from memory. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigModelFromMemory( + __pd_keep PD_Config* pd_config); +/// +/// \brief Turn on memory optimize +/// NOTE still in development. 
+/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigEnableMemoryOptim( + __pd_keep PD_Config* pd_config); +/// +/// \brief A boolean state telling whether the memory optimization is +/// activated. +/// +/// \param[in] pd_onfig config +/// \return Whether the memory optimization is activated. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigMemoryOptimEnabled( + __pd_keep PD_Config* pd_config); +/// +/// \brief Turn on profiling report. +/// If not turned on, no profiling report will be generated. +/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigEnableProfile( + __pd_keep PD_Config* pd_config); +/// +/// \brief A boolean state telling whether the profiler is activated. +/// +/// \param[in] pd_onfig config +/// \return bool Whether the profiler is activated. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigProfileEnabled( + __pd_keep PD_Config* pd_config); +/// +/// \brief Mute all logs in Paddle inference. +/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigDisableGlogInfo( + __pd_keep PD_Config* pd_config); +/// +/// \brief A boolean state telling whether logs in Paddle inference are muted. +/// +/// \param[in] pd_onfig config +/// \return Whether logs in Paddle inference are muted. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigGlogInfoDisabled( + __pd_keep PD_Config* pd_config); +/// +/// \brief Set the Config to be invalid. +/// This is to ensure that an Config can only be used in one +/// Predictor. +/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigSetInvalid( + __pd_keep PD_Config* pd_config); +/// +/// \brief A boolean state telling whether the Config is valid. +/// +/// \param[in] pd_onfig config +/// \return Whether the Config is valid. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigIsValid( + __pd_keep PD_Config* pd_config); +/// +/// \brief Partially release the memory +/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigPartiallyRelease( + __pd_keep PD_Config* pd_config); + +#ifdef __cplusplus +} // extern "C" +#endif diff --git a/paddle/fluid/operators/distributed/distributed_pb.h b/paddle/fluid/inference/capi_exp/pd_inference_api.h similarity index 58% rename from paddle/fluid/operators/distributed/distributed_pb.h rename to paddle/fluid/inference/capi_exp/pd_inference_api.h index f1c662be9af67b418e17987e4eb1ff0a2809c3e3..5f21dca1a7bf6a3a74f19cf814b7138d39db8054 100644 --- a/paddle/fluid/operators/distributed/distributed_pb.h +++ b/paddle/fluid/inference/capi_exp/pd_inference_api.h @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -14,17 +14,9 @@ #pragma once -#ifdef PADDLE_WITH_DISTRIBUTE - -#ifdef PADDLE_WITH_GRPC - -#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h" -#include "paddle/fluid/operators/distributed/send_recv.pb.h" - -#else // PADDLE_WITH_GRPC - -#include "paddle/fluid/operators/distributed/send_recv.pb.h" - -#endif // PADDLE_WITH_GRPC - -#endif // PADDLE_WITH_DISTRIBUTE +#include "pd_common.h" // NOLINT +#include "pd_config.h" // NOLINT +#include "pd_predictor.h" // NOLINT +#include "pd_tensor.h" // NOLINT +#include "pd_types.h" // NOLINT +#include "pd_utils.h" // NOLINT diff --git a/paddle/fluid/inference/capi_exp/pd_predictor.cc b/paddle/fluid/inference/capi_exp/pd_predictor.cc new file mode 100644 index 0000000000000000000000000000000000000000..f5287a5152957f5cda0db9dee82a7689267cd3d2 --- /dev/null +++ b/paddle/fluid/inference/capi_exp/pd_predictor.cc @@ -0,0 +1,109 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/capi_exp/pd_predictor.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/capi_exp/pd_types.h" +#include "paddle/fluid/inference/capi_exp/pd_utils.h" +#include "paddle/fluid/inference/capi_exp/types_internal.h" +#include "paddle/fluid/inference/capi_exp/utils_internal.h" +#include "paddle/fluid/platform/enforce.h" + +#define CHECK_AND_CONVERT_PD_PREDICTOR \ + PADDLE_ENFORCE_NOT_NULL( \ + pd_predictor, \ + paddle::platform::errors::InvalidArgument( \ + "The pointer of paddle predictor shouldn't be nullptr")); \ + auto& predictor = pd_predictor->predictor + +extern "C" { +__pd_give PD_Predictor* PD_PredictorCreate(__pd_take PD_Config* pd_config) { + PADDLE_ENFORCE_NOT_NULL( + pd_config, paddle::platform::errors::InvalidArgument( + "The pointer of paddle predictor shouldn't be nullptr")); + PD_Predictor* pd_predictor = new PD_Predictor(); + paddle_infer::Config* config = + reinterpret_cast(pd_config); + pd_predictor->predictor = paddle_infer::CreatePredictor(*config); + delete config; + return pd_predictor; +} + +__pd_give PD_Predictor* PD_PredictorClone( + __pd_keep PD_Predictor* pd_predictor) { + CHECK_AND_CONVERT_PD_PREDICTOR; + PD_Predictor* new_predictor = new PD_Predictor(); + new_predictor->predictor = predictor->Clone(); + return new_predictor; +} + +__pd_give PD_OneDimArrayCstr* PD_PredictorGetInputNames( + __pd_keep PD_Predictor* pd_predictor) { + CHECK_AND_CONVERT_PD_PREDICTOR; + std::vector names = predictor->GetInputNames(); + return paddle_infer::CvtVecToOneDimArrayCstr(names); +} + +__pd_give PD_OneDimArrayCstr* PD_PredictorGetOutputNames( + __pd_keep PD_Predictor* pd_predictor) { + CHECK_AND_CONVERT_PD_PREDICTOR; + std::vector names = predictor->GetOutputNames(); + return paddle_infer::CvtVecToOneDimArrayCstr(names); +} + +size_t PD_PredictorGetInputNum(__pd_keep PD_Predictor* pd_predictor) { + CHECK_AND_CONVERT_PD_PREDICTOR; + return predictor->GetInputNames().size(); +} + +size_t 
PD_PredictorGetOutputNum(__pd_keep PD_Predictor* pd_predictor) { + CHECK_AND_CONVERT_PD_PREDICTOR; + return predictor->GetOutputNames().size(); +} +__pd_give PD_Tensor* PD_PredictorGetInputHandle( + __pd_keep PD_Predictor* pd_predictor, const char* name) { + CHECK_AND_CONVERT_PD_PREDICTOR; + PD_Tensor* pd_tensor = new PD_Tensor(); + pd_tensor->tensor = predictor->GetInputHandle(name); + return pd_tensor; +} + +__pd_give PD_Tensor* PD_PredictorGetOutputHandle( + __pd_keep PD_Predictor* pd_predictor, const char* name) { + CHECK_AND_CONVERT_PD_PREDICTOR; + PD_Tensor* pd_tensor = new PD_Tensor(); + pd_tensor->tensor = predictor->GetOutputHandle(name); + return pd_tensor; +} + +PD_Bool PD_PredictorRun(__pd_keep PD_Predictor* pd_predictor) { + CHECK_AND_CONVERT_PD_PREDICTOR; + return predictor->Run(); +} + +void PD_PredictorClearIntermediateTensor(__pd_keep PD_Predictor* pd_predictor) { + CHECK_AND_CONVERT_PD_PREDICTOR; + predictor->ClearIntermediateTensor(); +} + +uint64_t PD_PredictorTryShrinkMemory(__pd_keep PD_Predictor* pd_predictor) { + CHECK_AND_CONVERT_PD_PREDICTOR; + return predictor->TryShrinkMemory(); +} + +void PD_PredictorDestroy(__pd_take PD_Predictor* pd_predictor) { + delete pd_predictor; +} + +} // extern "C" diff --git a/paddle/fluid/inference/capi_exp/pd_predictor.h b/paddle/fluid/inference/capi_exp/pd_predictor.h new file mode 100644 index 0000000000000000000000000000000000000000..d4542d0b6d394d2ebc67e6f63b0b52cefb5939b3 --- /dev/null +++ b/paddle/fluid/inference/capi_exp/pd_predictor.h @@ -0,0 +1,148 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/// +/// \file pd_predictor.h +/// +/// \brief interface for paddle predictor +/// +/// \author paddle-infer@baidu.com +/// \date 2021-04-21 +/// \since 2.1 +/// + +#pragma once + +#include "pd_common.h" // NOLINT + +typedef struct PD_Predictor PD_Predictor; +typedef struct PD_Config PD_Config; +typedef struct PD_Tensor PD_Tensor; +typedef struct PD_OneDimArrayCstr PD_OneDimArrayCstr; + +#ifdef __cplusplus +extern "C" { +#endif + +/// +/// \brief Create a new Predictor +/// +/// \param[in] pd_config config +/// \return new predictor. +/// +PADDLE_CAPI_EXPORT extern __pd_give PD_Predictor* PD_PredictorCreate( + __pd_take PD_Config* pd_config); +/// +/// \brief Clone a new Predictor +/// +/// \param[in] pd_predictor predictor +/// \return new predictor.
+/// +PADDLE_CAPI_EXPORT extern __pd_give PD_Predictor* PD_PredictorClone( + __pd_keep PD_Predictor* pd_predictor); +/// +/// \brief Get the input names +/// +/// \param[in] pd_predictor predictor +/// \return input names +/// +PADDLE_CAPI_EXPORT extern __pd_give PD_OneDimArrayCstr* +PD_PredictorGetInputNames(__pd_keep PD_Predictor* pd_predictor); +/// +/// \brief Get the output names +/// +/// \param[in] pd_predictor predictor +/// \return output names +/// +PADDLE_CAPI_EXPORT extern __pd_give PD_OneDimArrayCstr* +PD_PredictorGetOutputNames(__pd_keep PD_Predictor* pd_predictor); + +/// +/// \brief Get the input number +/// +/// \param[in] pd_predictor predictor +/// \return input number +/// +PADDLE_CAPI_EXPORT extern size_t PD_PredictorGetInputNum( + __pd_keep PD_Predictor* pd_predictor); + +/// +/// \brief Get the output number +/// +/// \param[in] pd_predictor predictor +/// \return output number +/// +PADDLE_CAPI_EXPORT extern size_t PD_PredictorGetOutputNum( + __pd_keep PD_Predictor* pd_predictor); + +/// +/// \brief Get the Input Tensor object +/// +/// \param[in] pd_predictor predictor +/// \param[in] name input name +/// \return input tensor +/// +PADDLE_CAPI_EXPORT extern __pd_give PD_Tensor* PD_PredictorGetInputHandle( + __pd_keep PD_Predictor* pd_predictor, const char* name); + +/// +/// \brief Get the Output Tensor object +/// +/// \param[in] pd_predictor predictor +/// \param[in] name output name +/// \return output tensor +/// +PADDLE_CAPI_EXPORT extern __pd_give PD_Tensor* PD_PredictorGetOutputHandle( + __pd_keep PD_Predictor* pd_predictor, const char* name); + +/// +/// \brief Run the prediction engine +/// +/// \param[in] pd_predictor predictor +/// \return Whether the function executed successfully +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_PredictorRun( + __pd_keep PD_Predictor* pd_predictor); + +/// \brief Clear the intermediate tensors of the predictor +/// +/// \param[in] pd_predictor predictor +/// +PADDLE_CAPI_EXPORT extern void PD_PredictorClearIntermediateTensor( + __pd_keep PD_Predictor* pd_predictor); + +/// +/// \brief Release all tmp tensor to compress the size of the memory pool. +/// The memory pool is considered to be composed of a list of chunks, if +/// the chunk is not occupied, it can be released. +/// +/// \param[in] pd_predictor predictor +/// \return Number of bytes released. It may be smaller than the actual +/// released memory, because part of the memory is not managed by the +/// MemoryPool. +/// +PADDLE_CAPI_EXPORT extern uint64_t PD_PredictorTryShrinkMemory( + __pd_keep PD_Predictor* pd_predictor); + +/// +/// \brief Destroy a predictor object +/// +/// \param[in] pd_predictor predictor +/// +PADDLE_CAPI_EXPORT extern void PD_PredictorDestroy( + __pd_take PD_Predictor* pd_predictor); + +#ifdef __cplusplus +} // extern "C" +#endif diff --git a/paddle/fluid/inference/capi_exp/pd_tensor.cc b/paddle/fluid/inference/capi_exp/pd_tensor.cc new file mode 100644 index 0000000000000000000000000000000000000000..9c661dea6f2bb2dcb168e5d08e80794195ef2710 --- /dev/null +++ b/paddle/fluid/inference/capi_exp/pd_tensor.cc @@ -0,0 +1,117 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/capi_exp/pd_tensor.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/capi_exp/pd_types.h" +#include "paddle/fluid/inference/capi_exp/pd_utils.h" +#include "paddle/fluid/inference/capi_exp/types_internal.h" +#include "paddle/fluid/inference/capi_exp/utils_internal.h" +#include "paddle/fluid/platform/enforce.h" + +#define CHECK_AND_CONVERT_PD_TENSOR \ + PADDLE_ENFORCE_NOT_NULL( \ + pd_tensor, paddle::platform::errors::InvalidArgument( \ + "The pointer of paddle tensor shouldn't be nullptr")); \ + auto& tensor = pd_tensor->tensor + +extern "C" { + +void PD_TensorDestroy(__pd_take PD_Tensor* pd_tensor) { delete pd_tensor; } +void PD_TensorReshape(__pd_keep PD_Tensor* pd_tensor, size_t shape_size, + int32_t* shape) { + CHECK_AND_CONVERT_PD_TENSOR; + std::vector<int> shapes(shape_size); + for (size_t index = 0; index < shape_size; ++index) { + shapes[index] = shape[index]; + } + tensor->Reshape(shapes); +} + +#define REPEAT_ALL_DATA_TYPE(func) \ + func(float, Float) func(int64_t, Int64) func(int32_t, Int32) \ + func(uint8_t, Uint8) func(int8_t, Int8) + +#define PD_TENSOR_MUTABLE_DATA_IMPL(type, Type) \ + type* PD_TensorMutableData##Type(__pd_keep PD_Tensor* pd_tensor, \ + PD_PlaceType place) { \ + CHECK_AND_CONVERT_PD_TENSOR; \ + return tensor->mutable_data<type>(paddle_infer::CvtToCxxPlaceType(place)); \ + } +REPEAT_ALL_DATA_TYPE(PD_TENSOR_MUTABLE_DATA_IMPL) +#undef PD_TENSOR_MUTABLE_DATA_IMPL + +#define PD_TENSOR_DATA_IMPL(type, Type) \ + type* PD_TensorData##Type(__pd_keep PD_Tensor* pd_tensor, \ + PD_PlaceType* place, int32_t* size) { \ + CHECK_AND_CONVERT_PD_TENSOR; \ + PADDLE_ENFORCE_NOT_NULL(place, \ + paddle::platform::errors::InvalidArgument( \ + "The pointer of place shouldn't be nullptr")); \ + PADDLE_ENFORCE_NOT_NULL(size, \ + paddle::platform::errors::InvalidArgument( \ + "The pointer of size shouldn't be nullptr")); \ + paddle_infer::PlaceType cxx_place_type; \ + int cxx_size; \ + type* data = tensor->data<type>(&cxx_place_type, &cxx_size); \ + *place = paddle_infer::CvtFromCxxPlaceType(cxx_place_type); \ + *size = static_cast<int32_t>(cxx_size); \ + return data; \ + } +REPEAT_ALL_DATA_TYPE(PD_TENSOR_DATA_IMPL) +#undef PD_TENSOR_DATA_IMPL + +#define PD_TENSOR_COPY_FROM_CPU_IMPL(type, Type) \ + void PD_TensorCopyFromCpu##Type(__pd_keep PD_Tensor* pd_tensor, \ + const type* data) { \ + CHECK_AND_CONVERT_PD_TENSOR; \ + tensor->CopyFromCpu(data); \ + } +REPEAT_ALL_DATA_TYPE(PD_TENSOR_COPY_FROM_CPU_IMPL) +#undef PD_TENSOR_COPY_FROM_CPU_IMPL + +#define PD_TENSOR_COPY_TO_CPU_IMPL(type, Type) \ + void PD_TensorCopyToCpu##Type(__pd_keep PD_Tensor* pd_tensor, type* data) { \ + CHECK_AND_CONVERT_PD_TENSOR; \ + tensor->CopyToCpu(data); \ + } +REPEAT_ALL_DATA_TYPE(PD_TENSOR_COPY_TO_CPU_IMPL) +#undef PD_TENSOR_COPY_TO_CPU_IMPL + +#undef REPEAT_ALL_DATA_TYPE + +__pd_give PD_OneDimArrayInt32* PD_TensorGetShape( + __pd_keep PD_Tensor* pd_tensor) { + CHECK_AND_CONVERT_PD_TENSOR; + return paddle_infer::CvtVecToOneDimArrayInt32(tensor->shape()); +} +void PD_TensorSetLod(__pd_keep PD_Tensor* pd_tensor, + __pd_keep
PD_TwoDimArraySize* lod) { + CHECK_AND_CONVERT_PD_TENSOR; + tensor->SetLoD(paddle_infer::CvtTwoDimArrayToVecSize(lod)); +} +__pd_give PD_TwoDimArraySize* PD_TensorGetLod(__pd_keep PD_Tensor* pd_tensor) { + CHECK_AND_CONVERT_PD_TENSOR; + return paddle_infer::CvtVecToTwoDimArraySize(tensor->lod()); +} +const char* PD_TensorGetName(__pd_keep PD_Tensor* pd_tensor) { + CHECK_AND_CONVERT_PD_TENSOR; + return tensor->name().c_str(); +} +PD_DataType PD_TensorGetDataType(__pd_keep PD_Tensor* pd_tensor) { + CHECK_AND_CONVERT_PD_TENSOR; + return paddle_infer::CvtFromCxxDatatype(tensor->type()); +} + +} // extern "C" diff --git a/paddle/fluid/inference/capi_exp/pd_tensor.h b/paddle/fluid/inference/capi_exp/pd_tensor.h new file mode 100644 index 0000000000000000000000000000000000000000..29ea4b5d62e43ccf44bb425f8c43c122c1b0f220 --- /dev/null +++ b/paddle/fluid/inference/capi_exp/pd_tensor.h @@ -0,0 +1,287 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/// +/// \file pd_tensor.h +/// +/// \brief interface for paddle tensor +/// +/// \author paddle-infer@baidu.com +/// \date 2021-04-21 +/// \since 2.1 +/// + +#pragma once + +#include "pd_common.h" // NOLINT + +typedef struct PD_Tensor PD_Tensor; +typedef struct PD_OneDimArrayInt32 PD_OneDimArrayInt32; +typedef struct PD_TwoDimArraySize PD_TwoDimArraySize; + +#ifdef __cplusplus +extern "C" { +#endif + +/// +/// \brief Destroy the paddle tensor +/// +/// \param[in] pd_tensor tensor +/// +PADDLE_CAPI_EXPORT extern void PD_TensorDestroy(__pd_take PD_Tensor* pd_tensor); + +/// +/// \brief Reset the shape of the tensor. +/// Generally it's only used for the input tensor. +/// Reshape must be called before calling PD_TensorMutableData*() or +/// PD_TensorCopyFromCpu*() +/// +/// \param[in] pd_tensor tensor. +/// \param[in] shape_size The size of shape. +/// \param[in] shape The shape to set. +/// +PADDLE_CAPI_EXPORT extern void PD_TensorReshape(__pd_keep PD_Tensor* pd_tensor, + size_t shape_size, + int32_t* shape); + +/// +/// \brief Get the memory pointer in CPU or GPU with 'float' data type. +/// Please Reshape the tensor first before call this. +/// It's usually used to get input data pointer. +/// +/// \param[in] pd_tensor tensor. +/// \param[in] place The place of the tensor. +/// \return Memory pointer of pd_tensor +/// +PADDLE_CAPI_EXPORT extern float* PD_TensorMutableDataFloat( + __pd_keep PD_Tensor* pd_tensor, PD_PlaceType place); +/// +/// \brief Get the memory pointer in CPU or GPU with 'int64_t' data type. +/// Please Reshape the tensor first before call this. +/// It's usually used to get input data pointer. +/// +/// \param[in] pd_tensor tensor. +/// \param[in] place The place of the tensor. +/// \return Memory pointer of pd_tensor +/// +PADDLE_CAPI_EXPORT extern int64_t* PD_TensorMutableDataInt64( + __pd_keep PD_Tensor* pd_tensor, PD_PlaceType place); +/// +/// \brief Get the memory pointer in CPU or GPU with 'int32_t' data type. 
+/// Please Reshape the tensor first before call this. +/// It's usually used to get input data pointer. +/// +/// \param[in] pd_tensor tensor. +/// \param[in] place The place of the tensor. +/// \return Memory pointer of pd_tensor +/// +PADDLE_CAPI_EXPORT extern int32_t* PD_TensorMutableDataInt32( + __pd_keep PD_Tensor* pd_tensor, PD_PlaceType place); +/// +/// \brief Get the memory pointer in CPU or GPU with 'uint8_t' data type. +/// Please Reshape the tensor first before call this. +/// It's usually used to get input data pointer. +/// +/// \param[in] pd_tensor tensor. +/// \param[in] place The place of the tensor. +/// \return Memory pointer of pd_tensor +/// +PADDLE_CAPI_EXPORT extern uint8_t* PD_TensorMutableDataUint8( + __pd_keep PD_Tensor* pd_tensor, PD_PlaceType place); +/// +/// \brief Get the memory pointer in CPU or GPU with 'int8_t' data type. +/// Please Reshape the tensor first before call this. +/// It's usually used to get input data pointer. +/// +/// \param[in] pd_tensor tensor. +/// \param[in] place The place of the tensor. +/// \return Memory pointer of pd_tensor +/// +PADDLE_CAPI_EXPORT extern int8_t* PD_TensorMutableDataInt8( + __pd_keep PD_Tensor* pd_tensor, PD_PlaceType place); +/// +/// \brief Get the memory pointer directly. +/// It's usually used to get the output data pointer. +/// +/// \param[in] pd_tensor tensor. +/// \param[out] place To get the device type of the tensor. +/// \param[out] size To get the data size of the tensor. +/// \return The tensor data buffer pointer. +/// +PADDLE_CAPI_EXPORT extern float* PD_TensorDataFloat( + __pd_keep PD_Tensor* pd_tensor, PD_PlaceType* place, int32_t* size); +/// +/// \brief Get the memory pointer directly. +/// It's usually used to get the output data pointer. +/// +/// \param[in] pd_tensor tensor. +/// \param[out] place To get the device type of the tensor. +/// \param[out] size To get the data size of the tensor. +/// \return The tensor data buffer pointer. +/// +PADDLE_CAPI_EXPORT extern int64_t* PD_TensorDataInt64( + __pd_keep PD_Tensor* pd_tensor, PD_PlaceType* place, int32_t* size); +/// +/// \brief Get the memory pointer directly. +/// It's usually used to get the output data pointer. +/// +/// \param[in] pd_tensor tensor. +/// \param[out] place To get the device type of the tensor. +/// \param[out] size To get the data size of the tensor. +/// \return The tensor data buffer pointer. +/// +PADDLE_CAPI_EXPORT extern int32_t* PD_TensorDataInt32( + __pd_keep PD_Tensor* pd_tensor, PD_PlaceType* place, int32_t* size); +/// +/// \brief Get the memory pointer directly. +/// It's usually used to get the output data pointer. +/// +/// \param[in] pd_tensor tensor. +/// \param[out] place To get the device type of the tensor. +/// \param[out] size To get the data size of the tensor. +/// \return The tensor data buffer pointer. +/// +PADDLE_CAPI_EXPORT extern uint8_t* PD_TensorDataUint8( + __pd_keep PD_Tensor* pd_tensor, PD_PlaceType* place, int32_t* size); +/// +/// \brief Get the memory pointer directly. +/// It's usually used to get the output data pointer. +/// +/// \param[in] pd_tensor tensor. +/// \param[out] place To get the device type of the tensor. +/// \param[out] size To get the data size of the tensor. +/// \return The tensor data buffer pointer. +/// +PADDLE_CAPI_EXPORT extern int8_t* PD_TensorDataInt8( + __pd_keep PD_Tensor* pd_tensor, PD_PlaceType* place, int32_t* size); +/// +/// \brief Copy the host memory to tensor data. +/// It's usually used to set the input tensor data. 
+/// \param[in] pd_tensor tensor. +/// \param[in] data The pointer of the data, from which the tensor will copy. +/// +PADDLE_CAPI_EXPORT extern void PD_TensorCopyFromCpuFloat( + __pd_keep PD_Tensor* pd_tensor, const float* data); +/// +/// \brief Copy the host memory to tensor data. +/// It's usually used to set the input tensor data. +/// \param[in] pd_tensor tensor. +/// \param[in] data The pointer of the data, from which the tensor will copy. +/// +PADDLE_CAPI_EXPORT extern void PD_TensorCopyFromCpuInt64( + __pd_keep PD_Tensor* pd_tensor, const int64_t* data); +/// +/// \brief Copy the host memory to tensor data. +/// It's usually used to set the input tensor data. +/// \param[in] pd_tensor tensor. +/// \param[in] data The pointer of the data, from which the tensor will copy. +/// +PADDLE_CAPI_EXPORT extern void PD_TensorCopyFromCpuInt32( + __pd_keep PD_Tensor* pd_tensor, const int32_t* data); +/// +/// \brief Copy the host memory to tensor data. +/// It's usually used to set the input tensor data. +/// \param[in] pd_tensor tensor. +/// \param[in] data The pointer of the data, from which the tensor will copy. +/// +PADDLE_CAPI_EXPORT extern void PD_TensorCopyFromCpuUint8( + __pd_keep PD_Tensor* pd_tensor, const uint8_t* data); +/// +/// \brief Copy the host memory to tensor data. +/// It's usually used to set the input tensor data. +/// \param[in] pd_tensor tensor. +/// \param[in] data The pointer of the data, from which the tensor will copy. +/// +PADDLE_CAPI_EXPORT extern void PD_TensorCopyFromCpuInt8( + __pd_keep PD_Tensor* pd_tensor, const int8_t* data); +/// +/// \brief Copy the tensor data to the host memory. +/// It's usually used to get the output tensor data. +/// \param[in] pd_tensor tensor. +/// \param[out] data The tensor will copy the data to the address. +/// +PADDLE_CAPI_EXPORT extern void PD_TensorCopyToCpuFloat( + __pd_keep PD_Tensor* pd_tensor, float* data); +/// +/// \brief Copy the tensor data to the host memory. +/// It's usually used to get the output tensor data. +/// \param[in] pd_tensor tensor. +/// \param[out] data The tensor will copy the data to the address. +/// +PADDLE_CAPI_EXPORT extern void PD_TensorCopyToCpuInt64( + __pd_keep PD_Tensor* pd_tensor, int64_t* data); +/// +/// \brief Copy the tensor data to the host memory. +/// It's usually used to get the output tensor data. +/// \param[in] pd_tensor tensor. +/// \param[out] data The tensor will copy the data to the address. +/// +PADDLE_CAPI_EXPORT extern void PD_TensorCopyToCpuInt32( + __pd_keep PD_Tensor* pd_tensor, int32_t* data); +/// +/// \brief Copy the tensor data to the host memory. +/// It's usually used to get the output tensor data. +/// \param[in] pd_tensor tensor. +/// \param[out] data The tensor will copy the data to the address. +/// +PADDLE_CAPI_EXPORT extern void PD_TensorCopyToCpuUint8( + __pd_keep PD_Tensor* pd_tensor, uint8_t* data); +/// +/// \brief Copy the tensor data to the host memory. +/// It's usually used to get the output tensor data. +/// \param[in] pd_tensor tensor. +/// \param[out] data The tensor will copy the data to the address. +/// +PADDLE_CAPI_EXPORT extern void PD_TensorCopyToCpuInt8( + __pd_keep PD_Tensor* pd_tensor, int8_t* data); +/// +/// \brief Get the tensor shape +/// \param[in] pd_tensor tensor. +/// \return The tensor shape. +/// +PADDLE_CAPI_EXPORT extern __pd_give PD_OneDimArrayInt32* PD_TensorGetShape( + __pd_keep PD_Tensor* pd_tensor); + +/// +/// \brief Set the tensor lod information +/// \param[in] pd_tensor tensor. 
+/// \param[in] lod lod information. +/// +PADDLE_CAPI_EXPORT extern void PD_TensorSetLod( + __pd_keep PD_Tensor* pd_tensor, __pd_keep PD_TwoDimArraySize* lod); +/// +/// \brief Get the tensor lod information +/// \param[in] pd_tensor tensor. +/// \return the lod information. +/// +PADDLE_CAPI_EXPORT extern __pd_give PD_TwoDimArraySize* PD_TensorGetLod( + __pd_keep PD_Tensor* pd_tensor); +/// +/// \brief Get the tensor name +/// \param[in] pd_tensor tensor. +/// \return the tensor name. +/// +PADDLE_CAPI_EXPORT extern const char* PD_TensorGetName( + __pd_keep PD_Tensor* pd_tensor); +/// +/// \brief Get the tensor data type +/// \param[in] pd_tensor tensor. +/// \return the tensor data type. +/// +PADDLE_CAPI_EXPORT extern PD_DataType PD_TensorGetDataType( + __pd_keep PD_Tensor* pd_tensor); + +#ifdef __cplusplus +} // extern "C" +#endif diff --git a/paddle/fluid/inference/capi_exp/pd_types.h b/paddle/fluid/inference/capi_exp/pd_types.h new file mode 100644 index 0000000000000000000000000000000000000000..a5da2913a9b20719346c426770bd9b40d779ffd0 --- /dev/null +++ b/paddle/fluid/inference/capi_exp/pd_types.h @@ -0,0 +1,40 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include "pd_common.h" // NOLINT + +typedef struct PD_OneDimArrayInt32 { + size_t size; + int32_t* data; +} PD_OneDimArrayInt32; // std::vector + +typedef struct PD_OneDimArraySize { + size_t size; + size_t* data; +} PD_OneDimArraySize; // std::vector + +typedef struct PD_OneDimArrayCstr { + size_t size; + char** data; +} PD_OneDimArrayCstr; // std::vector + +typedef struct PD_TwoDimArraySize { + size_t size; + PD_OneDimArraySize** data; +} PD_TwoDimArraySize; // std::vector> diff --git a/paddle/fluid/inference/capi_exp/pd_utils.cc b/paddle/fluid/inference/capi_exp/pd_utils.cc new file mode 100644 index 0000000000000000000000000000000000000000..2e762619f5567c3fce05272815f9a8a0f17d267c --- /dev/null +++ b/paddle/fluid/inference/capi_exp/pd_utils.cc @@ -0,0 +1,221 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
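The plain C structs defined in pd_types.h above are how the new C API hands variable-length results back to the caller: anything returned with the __pd_give annotation is owned by the caller and must be released with the matching *Destroy helper implemented in pd_utils.cc below. A minimal caller-side sketch follows; it is illustrative only and assumes a PD_Predictor* named predictor has already been created elsewhere.

#include <stdio.h>
#include "pd_inference_api.h"

/* Illustrative only: list the input names of an existing predictor. */
void print_input_names(PD_Predictor* predictor) {
  /* __pd_give: ownership of the returned array passes to the caller. */
  PD_OneDimArrayCstr* names = PD_PredictorGetInputNames(predictor);
  for (size_t i = 0; i < names->size; ++i) {
    printf("input #%zu: %s\n", i, names->data[i]);
  }
  /* The matching destroy call frees each C string and the array itself. */
  PD_OneDimArrayCstrDestroy(names);
}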
+ +#include + +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/capi_exp/pd_utils.h" +#include "paddle/fluid/inference/capi_exp/utils_internal.h" +#include "paddle/fluid/platform/enforce.h" + +#define DESTROY_ONE_DIM_ARRAY(type) \ + void PD_OneDimArray##type##Destroy(__pd_take PD_OneDimArray##type* array) { \ + if (array != NULL) { \ + delete[] array->data; \ + delete array; \ + } \ + } +#define CONVERT_VEC_TO_ONE_DIM_ARRAY(type, Type, vec_type) \ + __pd_give PD_OneDimArray##Type* CvtVecToOneDimArray##Type( \ + const std::vector& vec) { \ + PD_OneDimArray##Type* array = new PD_OneDimArray##Type; \ + array->size = vec.size(); \ + array->data = vec.empty() ? NULL : new type[vec.size()]; \ + for (size_t index = 0; index < vec.size(); ++index) { \ + array->data[index] = vec[index]; \ + } \ + return array; \ + } +#define CONVERT_ONE_DIM_ARRAY_TO_VEC(type, Type, vec_type) \ + std::vector CvtOneDimArrayToVec##Type( \ + __pd_keep const PD_OneDimArray##Type* array) { \ + std::vector vec; \ + if (array != NULL) { \ + vec.resize(array->size); \ + for (size_t index = 0; index < array->size; ++index) { \ + vec[index] = array->data[index]; \ + } \ + } \ + return vec; \ + } + +#define ONE_DIM_ARRAY_UTILS_FUNC_IMPL(type, Type, vec_type) \ + extern "C" { \ + DESTROY_ONE_DIM_ARRAY(Type); \ + } \ + namespace paddle_infer { \ + CONVERT_VEC_TO_ONE_DIM_ARRAY(type, Type, vec_type) \ + CONVERT_ONE_DIM_ARRAY_TO_VEC(type, Type, vec_type) \ + } + +ONE_DIM_ARRAY_UTILS_FUNC_IMPL(int32_t, Int32, int) +ONE_DIM_ARRAY_UTILS_FUNC_IMPL(size_t, Size, size_t) + +#undef ONE_DIM_ARRAY_UTILS_FUNC_IMPL +#undef CONVERT_ONE_DIM_ARRAY_TO_VEC +#undef CONVERT_VEC_TO_ONE_DIM_ARRAY +#undef DESTROY_ONE_DIM_ARRAY + +void PD_OneDimArrayCstrDestroy(__pd_take PD_OneDimArrayCstr* array) { + if (array != NULL) { + if (array->size != 0) { + for (size_t index = 0; index < array->size; ++index) { + delete[] array->data[index]; + } + } + delete[] array->data; + delete array; + } +} +namespace paddle_infer { + +__pd_give PD_OneDimArrayCstr* CvtVecToOneDimArrayCstr( + const std::vector& vec) { + PD_OneDimArrayCstr* array = new PD_OneDimArrayCstr; + array->size = vec.size(); + array->data = vec.empty() ? NULL : new char*[vec.size()]; + for (size_t index = 0u; index < vec.size(); ++index) { + array->data[index] = new char[vec[index].size() + 1]; + memcpy(array->data[index], vec[index].c_str(), vec[index].size() + 1); + } + return array; +} + +std::vector CvtOneDimArrayToVecCstr( + __pd_keep const PD_OneDimArrayCstr* array) { + std::vector vec; + for (size_t index = 0; index < array->size; ++index) { + vec.emplace_back(array->data[index]); + } + return vec; +} + +} // namespace paddle_infer + +#define DESTROY_TWO_DIM_ARRAY(type) \ + void PD_TwoDimArray##type##Destroy(__pd_take PD_TwoDimArray##type* array) { \ + if (array != NULL) { \ + if (array->size != 0) { \ + for (size_t index = 0; index < array->size; ++index) { \ + PD_OneDimArray##type##Destroy(array->data[index]); \ + } \ + } \ + delete[] array->data; \ + delete array; \ + } \ + } +#define CONVERT_VEC_TO_TWO_DIM_ARRAY(type, Type, vec_type) \ + __pd_give PD_TwoDimArray##Type* CvtVecToTwoDimArray##Type( \ + const std::vector>& vec) { \ + PD_TwoDimArray##Type* array = new PD_TwoDimArray##Type; \ + array->size = vec.size(); \ + array->data = vec.empty() ? 
NULL : new PD_OneDimArray##Type*[vec.size()]; \ + for (size_t index = 0; index < vec.size(); ++index) { \ + array->data[index] = CvtVecToOneDimArray##Type(vec[index]); \ + } \ + return array; \ + } +#define CONVERT_TWO_DIM_ARRAY_TO_VEC(type, Type, vec_type) \ + std::vector<std::vector<vec_type>> CvtTwoDimArrayToVec##Type( \ + __pd_keep const PD_TwoDimArray##Type* array) { \ + std::vector<std::vector<vec_type>> vec; \ + if (array != NULL && array->size != 0) { \ + vec.resize(array->size); \ + for (size_t index = 0; index < array->size; ++index) { \ + vec[index] = CvtOneDimArrayToVec##Type((array->data)[index]); \ + } \ + } \ + return vec; \ + } +#define TWO_DIM_ARRAY_UTILS_FUNC_IMPL(type, Type, vec_type) \ + extern "C" { \ + DESTROY_TWO_DIM_ARRAY(Type); \ + } \ + namespace paddle_infer { \ + CONVERT_VEC_TO_TWO_DIM_ARRAY(type, Type, vec_type) \ + CONVERT_TWO_DIM_ARRAY_TO_VEC(type, Type, vec_type) \ + } + +TWO_DIM_ARRAY_UTILS_FUNC_IMPL(size_t, Size, size_t) + +#undef TWO_DIM_ARRAY_UTILS_FUNC_IMPL +#undef CONVERT_TWO_DIM_ARRAY_TO_VEC +#undef CONVERT_VEC_TO_TWO_DIM_ARRAY +#undef DESTROY_TWO_DIM_ARRAY + +namespace paddle_infer { + +PlaceType CvtToCxxPlaceType(PD_PlaceType place_type) { + switch (place_type) { + case PD_PLACE_UNK: + return PlaceType::kUNK; + case PD_PLACE_CPU: + return PlaceType::kCPU; + case PD_PLACE_GPU: + return PlaceType::kGPU; + case PD_PLACE_XPU: + return PlaceType::kXPU; + default: + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "Unsupported paddle place type %d.", place_type)); + return PlaceType::kUNK; + } +} + +PD_PlaceType CvtFromCxxPlaceType(PlaceType place_type) { + switch (place_type) { + case PlaceType::kCPU: + return PD_PLACE_CPU; + case PlaceType::kGPU: + return PD_PLACE_GPU; + case PlaceType::kXPU: + return PD_PLACE_XPU; + default: + return PD_PLACE_UNK; + } +} + +DataType CvtToCxxDatatype(PD_DataType data_type) { + switch (data_type) { + case PD_DATA_FLOAT32: + return DataType::FLOAT32; + case PD_DATA_INT64: + return DataType::INT64; + case PD_DATA_INT32: + return DataType::INT32; + case PD_DATA_UINT8: + return DataType::UINT8; + default: + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "Unsupported paddle data type %d.", data_type)); + return DataType::FLOAT32; + } +} + +PD_DataType CvtFromCxxDatatype(DataType data_type) { + switch (data_type) { + case DataType::FLOAT32: + return PD_DATA_FLOAT32; + case DataType::INT64: + return PD_DATA_INT64; + case DataType::INT32: + return PD_DATA_INT32; + case DataType::UINT8: + return PD_DATA_UINT8; + default: + return PD_DATA_UNK; + } +} + +} // namespace paddle_infer diff --git a/paddle/fluid/inference/capi_exp/pd_utils.h b/paddle/fluid/inference/capi_exp/pd_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..68e519d4bb5e959dd618b5aa31d090c7a74dc2a7 --- /dev/null +++ b/paddle/fluid/inference/capi_exp/pd_utils.h @@ -0,0 +1,70 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
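Putting the predictor and tensor functions above together, a typical single-inference flow through this C API looks roughly like the sketch below. The shape, the buffer sizes, and the use of the first input/output name are illustrative assumptions, not part of the API.

#include <stdint.h>
#include "pd_inference_api.h"

/* Rough sketch: feed one float input of shape [1, 3, 224, 224], run, read output. */
void run_once(PD_Predictor* predictor, const float* input, float* output) {
  int32_t shape[4] = {1, 3, 224, 224};

  PD_OneDimArrayCstr* in_names = PD_PredictorGetInputNames(predictor);
  PD_Tensor* in = PD_PredictorGetInputHandle(predictor, in_names->data[0]);
  PD_TensorReshape(in, 4, shape);        /* reshape before CopyFromCpu */
  PD_TensorCopyFromCpuFloat(in, input);

  PD_PredictorRun(predictor);

  PD_OneDimArrayCstr* out_names = PD_PredictorGetOutputNames(predictor);
  PD_Tensor* out = PD_PredictorGetOutputHandle(predictor, out_names->data[0]);
  PD_TensorCopyToCpuFloat(out, output);  /* output buffer must be large enough */

  PD_TensorDestroy(out);
  PD_TensorDestroy(in);
  PD_OneDimArrayCstrDestroy(out_names);
  PD_OneDimArrayCstrDestroy(in_names);
}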
+ +/// +/// \file pd_utils.h +/// +/// \brief Some utility functions to destroy paddle structs. +/// +/// \author paddle-infer@baidu.com +/// \date 2021-04-21 +/// \since 2.1 +/// + +#pragma once + +#include +#include + +#include "pd_types.h" // NOLINT + +#ifdef __cplusplus +extern "C" { +#endif + +/// +/// \brief Destroy the PD_OneDimArrayInt32 object pointed to by the pointer. +/// +/// \param[in] array pointer to the PD_OneDimArrayInt32 object. +/// +PADDLE_CAPI_EXPORT extern void PD_OneDimArrayInt32Destroy( + __pd_take PD_OneDimArrayInt32* array); + +/// +/// \brief Destroy the PD_OneDimArrayCstr object pointed to by the pointer. +/// +/// \param[in] array pointer to the PD_OneDimArrayCstr object. +/// +PADDLE_CAPI_EXPORT extern void PD_OneDimArrayCstrDestroy( + __pd_take PD_OneDimArrayCstr* array); + +/// +/// \brief Destroy the PD_OneDimArraySize object pointed to by the pointer. +/// +/// \param[in] array pointer to the PD_OneDimArraySize object. +/// +PADDLE_CAPI_EXPORT extern void PD_OneDimArraySizeDestroy( + __pd_take PD_OneDimArraySize* array); + +/// +/// \brief Destroy the PD_TwoDimArraySize object pointed to by the pointer. +/// +/// \param[in] array pointer to the PD_TwoDimArraySize object. +/// +PADDLE_CAPI_EXPORT extern void PD_TwoDimArraySizeDestroy( + __pd_take PD_TwoDimArraySize* array); + +#ifdef __cplusplus +} // extern "C" +#endif diff --git a/paddle/fluid/inference/capi_exp/types_internal.h b/paddle/fluid/inference/capi_exp/types_internal.h new file mode 100644 index 0000000000000000000000000000000000000000..8a61b9a884c3bfe7c3b6cc3734536a36872f74d0 --- /dev/null +++ b/paddle/fluid/inference/capi_exp/types_internal.h @@ -0,0 +1,29 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/capi_exp/pd_common.h" + +typedef struct PD_Tensor { + std::unique_ptr<paddle_infer::Tensor> tensor; +} PD_Tensor; + +typedef struct PD_Predictor { + std::shared_ptr<paddle_infer::Predictor> predictor; +} PD_Predictor; diff --git a/paddle/fluid/inference/capi_exp/utils_internal.h b/paddle/fluid/inference/capi_exp/utils_internal.h new file mode 100644 index 0000000000000000000000000000000000000000..fbae512ecd85578cf236100ba1e0b00a6c18775f --- /dev/null +++ b/paddle/fluid/inference/capi_exp/utils_internal.h @@ -0,0 +1,153 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and +// limitations under the License. + +/// +/// \file utils_internal.h +/// +/// \brief Some utility functions used to convert objects between C structs +/// and C++ classes. +/// +/// \author paddle-infer@baidu.com +/// \date 2021-04-21 +/// \since 2.1 +/// + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/capi_exp/pd_types.h" + +namespace paddle_infer { + +/// +/// \brief Convert the 'std::vector<int>' object to a 'PD_OneDimArrayInt32' +/// object. +/// +/// \param[in] vec source object. +/// \return target object. +/// +__pd_give PD_OneDimArrayInt32* CvtVecToOneDimArrayInt32( + const std::vector<int>& vec); + +/// +/// \brief Convert the 'PD_OneDimArrayInt32' object to a 'std::vector<int>' +/// object. +/// +/// \param[in] array source object. +/// \return target object. +/// +std::vector<int> CvtOneDimArrayToVecInt32( + __pd_keep const PD_OneDimArrayInt32* array); + +/// +/// \brief Convert the 'std::vector<size_t>' object to a 'PD_OneDimArraySize' +/// object. +/// +/// \param[in] vec source object. +/// \return target object. +/// +__pd_give PD_OneDimArraySize* CvtVecToOneDimArraySize( + const std::vector<size_t>& vec); + +/// +/// \brief Convert the 'PD_OneDimArraySize' object to a 'std::vector<size_t>' +/// object. +/// +/// \param[in] array source object. +/// \return target object. +/// +std::vector<size_t> CvtOneDimArrayToVecSize( + __pd_keep const PD_OneDimArraySize* array); + +/// +/// \brief Convert the 'std::vector<std::string>' object to a +/// 'PD_OneDimArrayCstr' object. +/// +/// \param[in] vec source object. +/// \return target object. +/// +__pd_give PD_OneDimArrayCstr* CvtVecToOneDimArrayCstr( + const std::vector<std::string>& vec); + +/// +/// \brief Convert the 'PD_OneDimArrayCstr' object to a +/// 'std::vector<std::string>' object. +/// +/// \param[in] array source object. +/// \return target object. +/// +std::vector<std::string> CvtOneDimArrayToVecCstr( + __pd_keep const PD_OneDimArrayCstr* array); + +/// +/// \brief Convert the 'std::vector<std::vector<size_t>>' object to a +/// 'PD_TwoDimArraySize' object. +/// +/// \param[in] vec source object. +/// \return target object. +/// +__pd_give PD_TwoDimArraySize* CvtVecToTwoDimArraySize( + const std::vector<std::vector<size_t>>& vec); + +/// +/// \brief Convert the 'PD_TwoDimArraySize' object to a +/// 'std::vector<std::vector<size_t>>' object. +/// +/// \param[in] array source object. +/// \return target object. +/// +std::vector<std::vector<size_t>> CvtTwoDimArrayToVecSize( + __pd_keep const PD_TwoDimArraySize* array); + +/// +/// \brief Convert the 'PD_PlaceType' object to a 'paddle_infer::PlaceType' +/// object. +/// +/// \param[in] place_type source object. +/// \return target object. +/// +PlaceType CvtToCxxPlaceType(PD_PlaceType place_type); + +/// +/// \brief Convert the 'paddle_infer::PlaceType' object to a 'PD_PlaceType' +/// object. +/// +/// \param[in] place_type source object. +/// \return target object. +/// +PD_PlaceType CvtFromCxxPlaceType(PlaceType place_type); + +/// +/// \brief Convert the 'PD_DataType' object to a 'paddle_infer::DataType' +/// object. +/// +/// \param[in] data_type source object. +/// \return target object. +/// +DataType CvtToCxxDatatype(PD_DataType data_type); + +/// +/// \brief Convert the 'paddle_infer::DataType' object to a 'PD_DataType' +/// object. +/// +/// \param[in] data_type source object. +/// \return target object.
+/// +PD_DataType CvtFromCxxDatatype(DataType data_type); + +} // namespace paddle_infer diff --git a/paddle/fluid/inference/engine.h b/paddle/fluid/inference/engine.h index 1a13ba510384c010e476bf0ba0ad5b0ba84d3240..e29162cf5b23bacafcf2e5ef600a96ed4518c360 100644 --- a/paddle/fluid/inference/engine.h +++ b/paddle/fluid/inference/engine.h @@ -15,7 +15,6 @@ limitations under the License. */ #pragma once #include -#include "paddle/fluid/framework/framework.pb.h" namespace paddle { namespace inference { diff --git a/paddle/fluid/inference/lite/engine.cc b/paddle/fluid/inference/lite/engine.cc index 59a786e46c98bf5972f23bd6148712eccc198aa6..908e1ab990bb73b124158f66cd0413a4b6a20907 100644 --- a/paddle/fluid/inference/lite/engine.cc +++ b/paddle/fluid/inference/lite/engine.cc @@ -59,8 +59,14 @@ paddle::lite_api::PaddlePredictor* EngineManager::Create( #endif #ifdef LITE_SUBGRAPH_WITH_XPU + // Deprecated in Paddle-Lite release/v2.8 lite_cxx_config.set_xpu_workspace_l3_size_per_thread( cfg.xpu_l3_workspace_size); + lite_cxx_config.set_xpu_l3_cache_method(cfg.xpu_l3_workspace_size, + cfg.locked); + lite_cxx_config.set_xpu_conv_autotune(cfg.autotune, cfg.autotune_file); + lite_cxx_config.set_xpu_multi_encoder_method(cfg.precision, + cfg.adaptive_seqlen); #endif // create predictor diff --git a/paddle/fluid/inference/lite/engine.h b/paddle/fluid/inference/lite/engine.h index 5ba487cc24d7d58cd87853a58fc12f1a82c3610d..a64ef1eda828bf2a5fc96c1cc8435c0a4b6912c6 100644 --- a/paddle/fluid/inference/lite/engine.h +++ b/paddle/fluid/inference/lite/engine.h @@ -42,6 +42,11 @@ struct EngineConfig { // for xpu size_t xpu_l3_workspace_size; + bool locked = false; + bool autotune = true; + std::string autotune_file = ""; + std::string precision = "int16"; + bool adaptive_seqlen = false; // for x86 or arm int cpu_math_library_num_threads{1}; diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index f9586ca1701f74f140e6a78b8758a76c1739a54a..3820ac5d7cc24693c388554acea0aad6ab49b83a 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -5,6 +5,13 @@ nv_library(tensorrt_converter pad_op.cc split_op.cc prelu_op.cc leaky_relu_op.cc gelu_op.cc layer_norm_op.cc multihead_matmul_op.cc shuffle_channel_op.cc swish_op.cc instance_norm_op.cc stack_op.cc transpose_op.cc flatten_op.cc emb_eltwise_layernorm.cc skip_layernorm.cc scale_op.cc slice_op.cc hard_sigmoid_op.cc hard_swish_op.cc clip_op.cc + gather_op.cc + anchor_generator_op.cc + yolo_box_op.cc + roi_align_op.cc + affine_channel_op.cc + multiclass_nms_op.cc + nearest_interp_op.cc DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry) nv_test(test_op_converter SRCS test_op_converter.cc DEPS diff --git a/paddle/fluid/inference/tensorrt/convert/affine_channel_op.cc b/paddle/fluid/inference/tensorrt/convert/affine_channel_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..813342c08483b7e9124929d3f00d8155d337e67e --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/affine_channel_op.cc @@ -0,0 +1,94 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" + +namespace paddle { +namespace framework { +class Scope; + +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace inference { +namespace tensorrt { + +/* + * Affine Channel Op + */ +class AffineChannelOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + VLOG(3) << "convert a fluid affine_channel op to tensorrt scale nd layer"; + + framework::OpDesc op_desc(op, nullptr); + std::string input_name = op_desc.Input("X").front(); + std::string scale_name = op_desc.Input("Scale").front(); + std::string bias_name = op_desc.Input("Bias").front(); + std::string output_name = op_desc.Output("Out").front(); + + auto input_tensor = engine_->GetITensor(input_name); + auto idim = input_tensor->getDimensions(); + + auto* scale_v = scope.FindVar(scale_name); + auto* scale_t = scale_v->GetMutable<framework::LoDTensor>(); + float* scale_ptr = engine_->GetWeightCPUData(scale_name, scale_t, false); + + auto* bias_v = scope.FindVar(bias_name); + auto* bias_t = bias_v->GetMutable<framework::LoDTensor>(); + float* bias_ptr = engine_->GetWeightCPUData(bias_name, bias_t, false); + + auto data_layout = framework::StringToDataLayout( + BOOST_GET_CONST(std::string, op_desc.GetAttr("data_layout"))); + + PADDLE_ENFORCE_EQ( + data_layout, framework::DataLayout::kNCHW, + platform::errors::InvalidArgument( + "TensorRT affine channel converter can only convert NCHW format. " + "Other format should be run in fluid mode. Report a bug on github " + "issue if you see this line.")); + + // the tensorrt ScaleNd layer only supports spatial dims >= 2, + // so nhwc is not available (spatial dims == 0) + const int channel_axis = engine_->with_dynamic_shape(); + + TensorRTEngine::Weight scale_weights{nvinfer1::DataType::kFLOAT, + static_cast<void*>(scale_ptr), + (size_t)idim.d[channel_axis]}; + TensorRTEngine::Weight bias_weights{nvinfer1::DataType::kFLOAT, + static_cast<void*>(bias_ptr), + (size_t)idim.d[channel_axis]}; + TensorRTEngine::Weight power_weights{nvinfer1::DataType::kFLOAT, nullptr, + 0}; + + auto layer = TRT_ENGINE_ADD_LAYER(engine_, ScaleNd, *input_tensor, + nvinfer1::ScaleMode::kCHANNEL, + bias_weights.get(), scale_weights.get(), + power_weights.get(), channel_axis); + + RreplenishLayerAndOutput(layer, "affine_channel", {output_name}, test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(affine_channel, AffineChannelOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/anchor_generator_op.cc b/paddle/fluid/inference/tensorrt/convert/anchor_generator_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..56aab9785c90f37e170e204b20f8f00a09941018 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/anchor_generator_op.cc @@ -0,0 +1,79 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +/* Anchor Generator Op */ +class AnchorGeneratorOpConverter : public OpConverter { + public: + void operator()(const paddle::framework::proto::OpDesc& op, + const paddle::framework::Scope& scope, + bool test_mode) override { + VLOG(3) << "convert a fluid anchor generator op to tensorrt plugin"; + framework::OpDesc op_desc(op, nullptr); + std::string input_name = op_desc.Input("Input").front(); + std::string anchor_name = op_desc.Output("Anchors").front(); + std::string variance_name = op_desc.Output("Variances").front(); + + auto* input = engine_->GetITensor(input_name); + const auto input_dims = input->getDimensions(); // C, H, W + std::vector output_names{anchor_name, variance_name}; + + const auto anchor_sizes = + BOOST_GET_CONST(std::vector, op_desc.GetAttr("anchor_sizes")); + const auto aspect_ratios = + BOOST_GET_CONST(std::vector, op_desc.GetAttr("aspect_ratios")); + const auto stride = + BOOST_GET_CONST(std::vector, op_desc.GetAttr("stride")); + const auto variances = + BOOST_GET_CONST(std::vector, op_desc.GetAttr("variances")); + const auto offset = BOOST_GET_CONST(float, op_desc.GetAttr("offset")); + const int num_anchors = aspect_ratios.size() * anchor_sizes.size(); + bool is_dynamic = engine_->with_dynamic_shape(); + const auto height = input_dims.d[1]; + const auto width = input_dims.d[2]; + const int box_num = width * height * num_anchors; + const nvinfer1::DataType data_type = nvinfer1::DataType::kFLOAT; + + nvinfer1::IPluginV2* anchor_generator_plugin = nullptr; + if (is_dynamic) { + anchor_generator_plugin = new plugin::AnchorGeneratorPluginDynamic( + data_type, anchor_sizes, aspect_ratios, stride, variances, offset, + num_anchors); + } else { + anchor_generator_plugin = new plugin::AnchorGeneratorPlugin( + data_type, anchor_sizes, aspect_ratios, stride, variances, offset, + height, width, num_anchors, box_num); + } + + std::vector anchor_generator_inputs{input}; + auto* anchor_generator_layer = engine_->network()->addPluginV2( + anchor_generator_inputs.data(), anchor_generator_inputs.size(), + *anchor_generator_plugin); + + RreplenishLayerAndOutput(anchor_generator_layer, "anchor_generator", + output_names, test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(anchor_generator, AnchorGeneratorOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc index 26cd7b22d2baaacea65a800b456e69d28955699f..7ea41839cb939ff3a1a7b3c6921b6e014bcdc1b6 100644 --- a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc @@ -38,38 +38,6 @@ class BatchNormOpConverter : public OpConverter { VLOG(3) << "convert a fluid batch norm op to 
tensorrt batch_norm"; framework::OpDesc op_desc(op, nullptr); - PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1, - platform::errors::InvalidArgument( - "Invalid input X's size of batch_norm TRT converter. " - "Expected 1, received %d.", - op_desc.Input("X").size())); - PADDLE_ENFORCE_EQ(op_desc.Input("Bias").size(), 1, - platform::errors::InvalidArgument( - "Invalid input Bias's size of batch_norm TRT " - "converter. Expected 1, received %d.", - op_desc.Input("Bias").size())); // Bias is a weight - PADDLE_ENFORCE_EQ(op_desc.Input("Mean").size(), 1, - platform::errors::InvalidArgument( - "Invalid input Mean's size of batch_norm TRT " - "converter. Expected 1, received %d.", - op_desc.Input("Mean").size())); // Mean is a weight - PADDLE_ENFORCE_EQ(op_desc.Input("Scale").size(), 1, - platform::errors::InvalidArgument( - "Invalid input Scale's size of batch_norm TRT " - "converter. Expected 1, received %d.", - op_desc.Input("Scale").size())); // Scale is a weight - PADDLE_ENFORCE_EQ( - op_desc.Input("Variance").size(), 1, - platform::errors::InvalidArgument( - "Invalid input Variance's size of batch_norm TRT converter. " - "Expected 1, received %d.", - op_desc.Input("Variance").size())); // Variance is a weight - PADDLE_ENFORCE_EQ(op_desc.Output("Y").size(), 1, - platform::errors::InvalidArgument( - "Invalid output Y's size of batch_norm TRT " - "converter. Expected 1, received %d.", - op_desc.Output("Y").size())); - auto* X = engine_->GetITensor(op_desc.Input("X").front()); // Declare weights auto* Bias_v = scope.FindVar(op_desc.Input("Bias").front()); @@ -158,17 +126,49 @@ class BatchNormOpConverter : public OpConverter { TensorRTEngine::Weight power_weights{nvinfer1::DataType::kFLOAT, nullptr, 0}; - nvinfer1::IScaleLayer* layer = - TRT_ENGINE_ADD_LAYER(engine_, Scale, *const_cast(X), - nvinfer1::ScaleMode::kCHANNEL, shift_weights.get(), - scale_weights.get(), power_weights.get()); + int dynamic_shape_offset = engine_->with_dynamic_shape() ? 1 : 0; + nvinfer1::ILayer* layer = nullptr; + nvinfer1::IShuffleLayer* expand_layer = nullptr; + nvinfer1::IShuffleLayer* squeeze_layer = nullptr; + + auto x_dim = X->getDimensions(); + if (x_dim.nbDims < 3 + dynamic_shape_offset) { + nvinfer1::Dims expand_shape; + expand_shape.nbDims = 3 + dynamic_shape_offset; + for (int i = 0; i < 3 + dynamic_shape_offset; i++) { + if (i < x_dim.nbDims) { + expand_shape.d[i] = x_dim.d[i] < 0 ? 0 : x_dim.d[i]; + } else { + expand_shape.d[i] = 1; + } + } + expand_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *X); + expand_layer->setReshapeDimensions(expand_shape); + X = expand_layer->getOutput(0); + } + + layer = TRT_ENGINE_ADD_LAYER( + engine_, Scale, *X, nvinfer1::ScaleMode::kCHANNEL, shift_weights.get(), + scale_weights.get(), power_weights.get()); auto output_name = op_desc.Output("Y").front(); engine_->SetWeights(op_desc.Input("Bias").front(), std::move(combile_bias_tensor)); engine_->SetWeights(op_desc.Input("Scale").front(), std::move(combile_scale_tensor)); - RreplenishLayerAndOutput(layer, "pool2d", {output_name}, test_mode); + if (x_dim.nbDims < 3 + dynamic_shape_offset) { + nvinfer1::Dims squeeze_shape; + squeeze_shape.nbDims = x_dim.nbDims; + for (int i = 0; i < squeeze_shape.nbDims; i++) { + squeeze_shape.d[i] = x_dim.d[i] < 0 ? 
0 : x_dim.d[i]; + } + squeeze_layer = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *(layer->getOutput(0))); + squeeze_layer->setReshapeDimensions(squeeze_shape); + layer = static_cast(squeeze_layer); + } + RreplenishLayerAndOutput(layer, "batchnorm_add_scale", {output_name}, + test_mode); } }; diff --git a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc index 5515cd35daedc70b6ecad44f4295084546386b96..61199724bcfe30dfcfc0e044a54e49b62d3a0936 100644 --- a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc @@ -36,18 +36,6 @@ void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op, VLOG(3) << "convert a fluid " << name << " op to tensorrt layer without bias"; framework::OpDesc op_desc(op, nullptr); - PADDLE_ENFORCE_EQ(op_desc.Input("Input").size(), 1UL, - platform::errors::InvalidArgument( - "TRT Conv2d expect 1 input, but got %d input.", - op_desc.Input("Input").size())); - PADDLE_ENFORCE_EQ(op_desc.Input("Filter").size(), 1UL, - platform::errors::InvalidArgument( - "TRT Conv2d expect 1 filter, but got %d filter.", - op_desc.Input("Filter").size())); - PADDLE_ENFORCE_EQ(op_desc.Output("Output").size(), 1UL, - platform::errors::InvalidArgument( - "TRT Conv2d expect 1 output, but got %d output.", - op_desc.Output("Output").size())); auto* X = engine->GetITensor(op_desc.Input("Input").front()); std::string filter_var_name = op_desc.Input("Filter").front(); @@ -61,13 +49,6 @@ void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op, if (enable_int8) { #if IS_TRT_VERSION_GE(5000) - if (op_desc.Type() != "conv2d_transpose") { - PADDLE_ENFORCE_EQ( - op_desc.HasAttr("Input_scale"), true, - platform::errors::InvalidArgument("Input scale not found. TRT int8" - " requires conv/deconv to have " - "input quantization scales.")); - } float in_scale = BOOST_GET_CONST(float, op_desc.GetAttr("Input_scale")) * 127; auto weight_scale = @@ -179,19 +160,11 @@ class Deconv2dOpConverter : public OpConverter { nvinfer1::DimsHW& ksize, TensorRTEngine::Weight& weight, TensorRTEngine::Weight& bias) -> nvinfer1::IDeconvolutionLayer* { auto* layer = - TRT_ENGINE_ADD_LAYER(engine_, Deconvolution, *inputs, n_input, + TRT_ENGINE_ADD_LAYER(engine_, Deconvolution, *inputs, n_output, ksize, weight.get(), bias.get()); return layer; }, [](nvinfer1::IDeconvolutionLayer* layer, nvinfer1::DimsHW& dilations) { - // In trt Deconv, dilation should be 1, ohter values are not - // supported. 
- bool condition = (dilations.d[0] == 1 && dilations.d[1] == 1); - PADDLE_ENFORCE_EQ(condition, true, - platform::errors::InvalidArgument( - "In Deconv, Dilations must be (1, 1) for " - "tensorRT, but given (%d, %d)", - dilations.d[0], dilations.d[1])); }, "conv2d_transpose"); } diff --git a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc index dfadb28a6520f983986263b38be69fa48335d485..47f5cc97d39cdf785bdbcbc468714f3fe0357209 100644 --- a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc @@ -25,6 +25,10 @@ static bool CheckDims(const nvinfer1::Dims& dims_x, return false; } for (int i = 0; i < dims_x.nbDims; i++) { + // conservative judgment + if (dims_x.d[i] == -1 || dims_y.d[i] == -1) { + return false; + } if (dims_x.d[i] != dims_y.d[i]) { return false; } @@ -43,25 +47,6 @@ class ElementwiseWeightOpConverter : public OpConverter { framework::OpDesc op_desc(op, nullptr); VLOG(3) << "Convert a fluid elementwise op to TensorRT IScaleLayer"; - PADDLE_ENFORCE_EQ( - op_desc.Input("X").size(), 1, - platform::errors::InvalidArgument( - "The input op's Input(\"X\").size() " - "should equal to 1, but received Input(\"X\").size() = %u.", - op_desc.Input("X").size())); - PADDLE_ENFORCE_EQ( - op_desc.Input("Y").size(), 1, - platform::errors::InvalidArgument( - "The input op's Input(\"Y\").size() " - "should equal to 1, but received Input(\"Y\").size() = %u.", - op_desc.Input("Y").size())); // Y is a weight - PADDLE_ENFORCE_EQ( - op_desc.Output("Out").size(), 1, - platform::errors::InvalidArgument( - "The input op's Output(\"Out\").size() " - "should equal to 1, but reveceid Output(\"Out\").size() = %u.", - op_desc.Output("Out").size())); - auto* X = engine_->GetITensor(op_desc.Input("X").front()); auto* Y_v = scope.FindVar(op_desc.Input("Y").front()); PADDLE_ENFORCE_NOT_NULL( @@ -81,6 +66,25 @@ class ElementwiseWeightOpConverter : public OpConverter { 0}; TensorRTEngine::Weight power_weights{nvinfer1::DataType::kFLOAT, nullptr, 0}; + + nvinfer1::IShuffleLayer* expand_layer = nullptr; + nvinfer1::IShuffleLayer* squeeze_layer = nullptr; + int dynamic_shape_offset = engine_->with_dynamic_shape() ? 1 : 0; + auto input_dim = X->getDimensions(); + if (input_dim.nbDims < 3 + dynamic_shape_offset) { + nvinfer1::Dims expand_shape; + expand_shape.nbDims = 3 + dynamic_shape_offset; + for (int i = 0; i < expand_shape.nbDims; i++) { + if (i < input_dim.nbDims) { + expand_shape.d[i] = input_dim.d[i] < 0 ? 0 : input_dim.d[i]; + } else { + expand_shape.d[i] = 1; + } + } + expand_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *X); + expand_layer->setReshapeDimensions(expand_shape); + X = expand_layer->getOutput(0); + } if (op_type_ == "add") { nvinfer1::IScaleLayer* scale_layer = TRT_ENGINE_ADD_LAYER( engine_, Scale, *X, scale_mode, shift_weights.get(), @@ -92,7 +96,17 @@ class ElementwiseWeightOpConverter : public OpConverter { shift_weights.get(), power_weights.get()); layer = scale_layer; } - + if (input_dim.nbDims < 3 + dynamic_shape_offset) { + nvinfer1::Dims squeeze_shape; + squeeze_shape.nbDims = input_dim.nbDims; + for (int i = 0; i < squeeze_shape.nbDims; i++) { + squeeze_shape.d[i] = input_dim.d[i] < 0 ? 
0 : input_dim.d[i]; + } + squeeze_layer = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *(layer->getOutput(0))); + squeeze_layer->setReshapeDimensions(squeeze_shape); + layer = static_cast(squeeze_layer); + } auto output_name = op_desc.Output("Out")[0]; RreplenishLayerAndOutput(layer, "elementwise_" + op_type_, {output_name}, test_mode); @@ -193,25 +207,6 @@ class ElementwiseTensorOpConverter : public OpConverter { framework::OpDesc op_desc(op, nullptr); nvinfer1::ILayer* layer = nullptr; - PADDLE_ENFORCE_EQ( - op_desc.Input("X").size(), 1, - platform::errors::InvalidArgument( - "The input op's Input(\"X\").size() " - "should equal to 1, but received Input(\"X\").size() = %u.", - op_desc.Input("X").size())); - PADDLE_ENFORCE_EQ( - op_desc.Input("Y").size(), 1, - platform::errors::InvalidArgument( - "The input op's Input(\"Y\").size() " - "should equal to 1, but received Input(\"Y\").size() = %u.", - op_desc.Input("Y").size())); // Y is a weight - PADDLE_ENFORCE_EQ( - op_desc.Output("Out").size(), 1, - platform::errors::InvalidArgument( - "The input op's Output(\"Out\").size() " - "should equal to 1, but received Output(\"Out\").size() = %u.", - op_desc.Output("Out").size())); - auto* X = engine_->GetITensor(op_desc.Input("X").front()); auto* Y = engine_->GetITensor(op_desc.Input("Y").front()); std::vector itensors; @@ -251,7 +246,7 @@ class ElementwiseTensorOpConverter : public OpConverter { #if IS_TRT_VERSION_GE(6000) plugin::ElementwisePluginDynamic* plugin = new plugin::ElementwisePluginDynamic(op_type_, axis); - layer = engine_->AddPluginV2(itensors.data(), 2, plugin); + layer = engine_->AddDynamicPlugin(itensors.data(), 2, plugin); #else PADDLE_THROW(platform::errors::Fatal( "You are running the TRT Dynamic Shape mode, need to confirm that " diff --git a/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc index 7f8843a3f67d05465788132ac85257dcdf3c322c..66a682db07b91195046d3d11031b8739b72b81c4 100644 --- a/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc +++ b/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc @@ -31,16 +31,20 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter { void operator()(const framework::proto::OpDesc& op, const framework::Scope& scope, bool test_mode) override { #if IS_TRT_VERSION_GE(6000) - VLOG(4) << "convert fluid swish op to tensorrt layer"; + VLOG(4) << "convert fluid EmbEltwiseLayerNorm op to tensorrt layer"; framework::OpDesc op_desc(op, nullptr); - auto id_names = op_desc.Input("Ids"); - auto emb_names = op_desc.Input("Embs"); + auto word_id_name = op_desc.Input("WordId").front(); + auto pos_id_name = op_desc.Input("PosId").front(); + auto sent_id_name = op_desc.Input("SentId").front(); + auto word_emb_name = op_desc.Input("WordEmbedding").front(); + auto pos_emb_name = op_desc.Input("PosEmbedding").front(); + auto sent_emb_name = op_desc.Input("SentEmbedding").front(); + std::vector id_names = {word_id_name, pos_id_name, + sent_id_name}; + std::vector emb_names = {word_emb_name, pos_emb_name, + sent_emb_name}; - PADDLE_ENFORCE_EQ(id_names.size(), emb_names.size(), - platform::errors::InvalidArgument( - "The id and emb size of fused EmbEltwiseLayerNormOp " - "should be same ")); int input_num = id_names.size(); // Declare inputs @@ -89,97 +93,98 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter { int64_t bias_size = framework::product(bias_dims); int64_t scale_size = framework::product(scale_dims); nvinfer1::ILayer* layer = 
nullptr; + bool enable_int8 = op_desc.HasAttr("enable_int8"); - if (engine_->with_dynamic_shape()) { - if (engine_->use_oss()) { - int output_fp16 = static_cast((engine_->WithFp16() == 1) ? 1 : 0); - PADDLE_ENFORCE_EQ( - output_fp16, 1, - platform::errors::InvalidArgument( - "Only Precision::KHalf(fp16) is supported when infering " - "ernie(bert) model with config.EnableTensorRtOSS(). " - "But Precision::KFloat32 is setted.")); - const std::vector fields{ - {"bert_embeddings_layernorm_beta", bias, - nvinfer1::PluginFieldType::kFLOAT32, - static_cast(bias_size)}, - {"bert_embeddings_layernorm_gamma", scale, - nvinfer1::PluginFieldType::kFLOAT32, - static_cast(scale_size)}, - {"bert_embeddings_word_embeddings", input_embs[0], - nvinfer1::PluginFieldType::kFLOAT32, - static_cast(emb_sizes[0])}, - {"bert_embeddings_token_type_embeddings", input_embs[2], - nvinfer1::PluginFieldType::kFLOAT32, - static_cast(emb_sizes[2])}, - {"bert_embeddings_position_embeddings", input_embs[1], - nvinfer1::PluginFieldType::kFLOAT32, - static_cast(emb_sizes[1])}, - {"output_fp16", &output_fp16, nvinfer1::PluginFieldType::kINT32, 1}, - }; - - // remember to free - nvinfer1::PluginFieldCollection* plugin_ptr = - static_cast( - malloc(sizeof(*plugin_ptr) + - fields.size() * sizeof(nvinfer1::PluginField))); - plugin_ptr->nbFields = static_cast(fields.size()); - plugin_ptr->fields = fields.data(); - - std::vector plugin_inputs; - plugin_inputs.emplace_back(engine_->GetITensor( - engine_->network()->getInput(0)->getName())); // word_embedding, - // eval_placeholder_0 - plugin_inputs.emplace_back(engine_->GetITensor( - engine_->network()->getInput(1)->getName())); // sent_embedding, - // eval_placeholder_1 - plugin_inputs.emplace_back(engine_->GetITensor( - engine_->network()->getInput(2)->getName())); // cu_seqlens, - // eval_placeholder_2 - auto max_seqlen_tensor = - engine_->GetITensor(engine_->network()->getInput(3)->getName()); - auto* shuffle_layer = TRT_ENGINE_ADD_LAYER( - engine_, Shuffle, - *const_cast(max_seqlen_tensor)); - nvinfer1::Dims shape_dim; - shape_dim.nbDims = 1; - shape_dim.d[0] = -1; - shuffle_layer->setReshapeDimensions(shape_dim); - plugin_inputs.emplace_back( - shuffle_layer->getOutput(0)); // max_seqlen, eval_placeholder_3 - - auto creator = GetPluginRegistry()->getPluginCreator( - "CustomEmbLayerNormPluginDynamic", "2"); - - auto plugin_obj = creator->createPlugin( - "CustomEmbLayerNormPluginDynamic", plugin_ptr); - auto plugin_layer = engine_->network()->addPluginV2( - plugin_inputs.data(), plugin_inputs.size(), *plugin_obj); - layer = plugin_layer; - free(plugin_ptr); - auto output_name = op_desc.Output("Out")[0]; - RreplenishLayerAndOutput(layer, "emb_eltwise_layernorm", - {output_name, std::string("qkv_plugin_mask")}, - test_mode); - } else { - bool with_fp16 = - engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); - float eps = BOOST_GET_CONST(float, op_desc.GetAttr("epsilon")); - plugin::DynamicPluginTensorRT* plugin = nullptr; - plugin = new plugin::EmbEltwiseLayernormPluginDynamic( - input_embs, bias, scale, emb_sizes, bias_size, scale_size, hidden, - eps, with_fp16); - layer = engine_->AddPluginV2(input_ids.data(), input_num, plugin); - auto output_name = op_desc.Output("Out")[0]; - RreplenishLayerAndOutput(layer, "emb_eltwise_layernorm", {output_name}, - test_mode); + if (engine_->use_oss()) { + int output_fp16 = static_cast((engine_->WithFp16() == 1) ? 
1 : 0); + if (enable_int8) { + output_fp16 = 1; } + PADDLE_ENFORCE_EQ( + input_num, 3, + platform::errors::InvalidArgument( + "When using oss and var-len, embedding_eltwise_layernorm op" + "should have 3 inputs only, but got %d.", + input_num)); + PADDLE_ENFORCE_EQ( + output_fp16, 1, + platform::errors::InvalidArgument( + "Only Precision::KHalf(fp16) is supported when infering " + "ernie(bert) model with config.EnableTensorRtOSS(). " + "But Precision::KFloat32 is setted.")); + const std::vector fields{ + {"bert_embeddings_layernorm_beta", bias, + nvinfer1::PluginFieldType::kFLOAT32, + static_cast(bias_size)}, + {"bert_embeddings_layernorm_gamma", scale, + nvinfer1::PluginFieldType::kFLOAT32, + static_cast(scale_size)}, + {"bert_embeddings_word_embeddings", input_embs[0], + nvinfer1::PluginFieldType::kFLOAT32, + static_cast(emb_sizes[0])}, + {"bert_embeddings_token_type_embeddings", input_embs[2], + nvinfer1::PluginFieldType::kFLOAT32, + static_cast(emb_sizes[2])}, + {"bert_embeddings_position_embeddings", input_embs[1], + nvinfer1::PluginFieldType::kFLOAT32, + static_cast(emb_sizes[1])}, + {"output_fp16", &output_fp16, nvinfer1::PluginFieldType::kINT32, 1}, + }; + + // remember to free + nvinfer1::PluginFieldCollection* plugin_ptr = + static_cast( + malloc(sizeof(*plugin_ptr) + + fields.size() * sizeof(nvinfer1::PluginField))); + plugin_ptr->nbFields = static_cast(fields.size()); + plugin_ptr->fields = fields.data(); + + std::vector plugin_inputs; + plugin_inputs.emplace_back( + engine_->GetITensor(word_id_name)); // word_embedding, + // eval_placeholder_0 + plugin_inputs.emplace_back( + engine_->GetITensor(sent_id_name)); // sent_embedding, + // eval_placeholder_1 + plugin_inputs.emplace_back( + engine_->GetITensor(pos_id_name)); // cu_seqlens, + // eval_placeholder_2 + auto max_seqlen_tensor = + engine_->GetITensor(engine_->network()->getInput(3)->getName()); + auto* shuffle_layer = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *max_seqlen_tensor); + nvinfer1::Dims shape_dim; + shape_dim.nbDims = 1; + shape_dim.d[0] = -1; + shuffle_layer->setReshapeDimensions(shape_dim); + plugin_inputs.emplace_back( + shuffle_layer->getOutput(0)); // max_seqlen, eval_placeholder_3 + + auto creator = GetPluginRegistry()->getPluginCreator( + "CustomEmbLayerNormPluginDynamic", "2"); + + auto plugin_obj = + creator->createPlugin("CustomEmbLayerNormPluginDynamic", plugin_ptr); + auto plugin_layer = engine_->network()->addPluginV2( + plugin_inputs.data(), plugin_inputs.size(), *plugin_obj); + layer = plugin_layer; + free(plugin_ptr); + auto output_name = op_desc.Output("Out")[0]; + RreplenishLayerAndOutput(layer, "emb_eltwise_layernorm", + {output_name, std::string("qkv_plugin_mask")}, + test_mode); } else { - PADDLE_THROW(platform::errors::Fatal( - "You are running the Ernie(Bert) model in static" - "shape mode, which is not supported for the time being.\n" - "You can use the config.SetTRTDynamicShapeInfo(...) 
interface" - " to set the shape information to run the dynamic shape mode.")); + bool with_fp16 = + engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); + float eps = BOOST_GET_CONST(float, op_desc.GetAttr("epsilon")); + plugin::DynamicPluginTensorRT* plugin = nullptr; + plugin = new plugin::EmbEltwiseLayernormPluginDynamic( + input_embs, bias, scale, emb_sizes, bias_size, scale_size, hidden, + eps, with_fp16); + layer = engine_->AddDynamicPlugin(input_ids.data(), input_num, plugin); + auto output_name = op_desc.Output("Out")[0]; + RreplenishLayerAndOutput(layer, "emb_eltwise_layernorm", {output_name}, + test_mode); } #else diff --git a/paddle/fluid/inference/tensorrt/convert/fc_op.cc b/paddle/fluid/inference/tensorrt/convert/fc_op.cc index 41fbbb557d6470228b9315f8d0e0ed1e5ad905ac..d2dcd4d11bfc8fa0b1021a2481ff930527567a9f 100644 --- a/paddle/fluid/inference/tensorrt/convert/fc_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc @@ -37,7 +37,7 @@ class FcOpConverter : public OpConverter { const framework::Scope& scope, bool test_mode) override { VLOG(3) << "convert a fluid fc op to tensorrt fc layer without bias"; framework::OpDesc op_desc(op, nullptr); - + auto output_name = op_desc.Output("Out").front(); auto input_names = op_desc.InputNames(); bool with_bias = input_names.size() >= 3; std::string w_name = "Y"; @@ -54,7 +54,7 @@ class FcOpConverter : public OpConverter { Y_v, platform::errors::NotFound( "Can not find %s presistale var of fc in scope.", w_name)); auto* Y_t = Y_v->GetMutable(); - const int x_num_col_dims = + int x_num_col_dims = op_desc.HasAttr("x_num_col_dims") ? BOOST_GET_CONST(int, op_desc.GetAttr("x_num_col_dims")) : (op_desc.HasAttr("in_num_col_dims") @@ -106,17 +106,55 @@ class FcOpConverter : public OpConverter { auto regist_fc = [&](nvinfer1::ITensor* inputs, int n_output, TensorRTEngine::Weight& weight, TensorRTEngine::Weight& bias) { - auto* fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *inputs, - n_output, weight.get(), bias.get()); - - auto output_name = op_desc.Output("Out").front(); - if (activation_type == "relu") { - nvinfer1::IActivationLayer* relu_layer = - TRT_ENGINE_ADD_LAYER(engine_, Activation, *(fc_layer->getOutput(0)), - nvinfer1::ActivationType::kRELU); - RreplenishLayerAndOutput(relu_layer, "fc", {output_name}, test_mode); + if (enable_int8) { + // add conv layer + PADDLE_ENFORCE_EQ( + op_desc.HasAttr("out_threshold"), true, + platform::errors::InvalidArgument( + "must have out threshold in fc layers in int8 mode")); + float out_scale = + BOOST_GET_CONST(float, op_desc.GetAttr("out_threshold")); + nvinfer1::DimsHW nv_ksize(1, 1); + auto* fc_layer_int8 = + TRT_ENGINE_ADD_LAYER(engine_, Convolution, *inputs, n_output, + nv_ksize, weight.get(), bias.get()); + engine_->SetTensorDynamicRange(fc_layer_int8->getOutput(0), out_scale); + if (activation_type == "relu") { + nvinfer1::IActivationLayer* relu_layer_int8 = TRT_ENGINE_ADD_LAYER( + engine_, Activation, *(fc_layer_int8->getOutput(0)), + nvinfer1::ActivationType::kRELU); + RreplenishLayerAndOutput(relu_layer_int8, "relu_after_fc_shuffle", + {output_name}, test_mode); + } else { + RreplenishLayerAndOutput(fc_layer_int8, "shuffle_after_fc", + {output_name}, test_mode); + } } else { - RreplenishLayerAndOutput(fc_layer, "fc", {output_name}, test_mode); + // add fc layer + auto* fc_layer_before = + TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *inputs, n_output, + weight.get(), bias.get()); + fc_layer_before->setName( + ("fc_layer_before(Output: " + output_name + ")").c_str()); 
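// --- Illustrative sketch (not part of this patch) ------------------------------
// Why a Shuffle follows the FullyConnected layer here: TensorRT's
// IFullyConnectedLayer flattens the trailing three dimensions of its (already
// padded) input and emits an output shaped [*, n_output, 1, 1]. The reshape added
// below therefore restores a rank of x_num_col_dims + 1, relying on the TensorRT
// convention that a 0 in setReshapeDimensions() means "copy the corresponding
// input dimension". A minimal stand-alone illustration of that convention in
// plain C++ (all names and shapes below are hypothetical, for explanation only):
#include <cstdio>
#include <vector>

static std::vector<int> ApplyTrtReshape(const std::vector<int>& in,
                                        const std::vector<int>& spec) {
  std::vector<int> out(spec.size());
  for (size_t i = 0; i < spec.size(); ++i) {
    // 0 copies the input dimension at the same index, as IShuffleLayer does.
    out[i] = (spec[i] == 0 && i < in.size()) ? in[i] : spec[i];
  }
  return out;
}

int main() {
  std::vector<int> fc_out = {2, 128, 768, 1, 1};  // FC output, x_num_col_dims = 2
  std::vector<int> spec = {0, 0, 0};              // reshape_after_fc_dim above
  for (int d : ApplyTrtReshape(fc_out, spec)) std::printf("%d ", d);  // 2 128 768
  return 0;
}
// --------------------------------------------------------------------------------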
+ // add shuffle after fc + nvinfer1::Dims reshape_after_fc_dim; + reshape_after_fc_dim.nbDims = x_num_col_dims + 1; + for (int i = 0; i < reshape_after_fc_dim.nbDims; i++) { + reshape_after_fc_dim.d[i] = 0; + } + auto* fc_layer_float = TRT_ENGINE_ADD_LAYER( + engine_, Shuffle, *fc_layer_before->getOutput(0)); + fc_layer_float->setReshapeDimensions(reshape_after_fc_dim); + if (activation_type == "relu") { + nvinfer1::IActivationLayer* relu_layer_float = TRT_ENGINE_ADD_LAYER( + engine_, Activation, *(fc_layer_float->getOutput(0)), + nvinfer1::ActivationType::kRELU); + RreplenishLayerAndOutput(relu_layer_float, "relu_after_fc_shuffle", + {output_name}, test_mode); + } else { + RreplenishLayerAndOutput(fc_layer_float, "shuffle_after_fc", + {output_name}, test_mode); + } } }; @@ -143,71 +181,43 @@ class FcOpConverter : public OpConverter { static_cast(bias_data), static_cast(bias_num)}; - if (engine_->with_dynamic_shape()) { - regist_fc(X, n_output, weight, bias); - return; + auto x_dim = X->getDimensions(); + // Running the TRT Static Shape mode: x_num_col_dims-1 + if (!engine_->with_dynamic_shape()) { + x_num_col_dims--; } - // in order to handle situations in NLP models(input dims < 3, - // x_num_col_dims != 1, etc.), reshape input to perform FC correctly. - auto* reshape_itensor = X; - int input_dims = X->getDimensions().nbDims; - auto input_d = X->getDimensions().d; - int reshape_dim3[3] = {0}; - int reshape_dim4[4] = {0}; - PADDLE_ENFORCE_EQ( - x_num_col_dims == 1 || x_num_col_dims == 2, true, + PADDLE_ENFORCE_GT( + x_dim.nbDims, x_num_col_dims, platform::errors::InvalidArgument( - "Wrong x_num_col_dims param of op mul. Paddle-TRT FC converter " - "expects x_num_col_dims is either 1 or 2, but got %d", - x_num_col_dims)); - PADDLE_ENFORCE_LE(x_num_col_dims, input_dims, - platform::errors::InvalidArgument( - "Params and input dims mismatch. Paddle-TRT FC " - "converter expects x_num_col_dims <= input dims")); - if (x_num_col_dims == 1) { - if (input_dims == 4) { - PADDLE_ENFORCE_EQ( - input_d[3], 1, - platform::errors::InvalidArgument( - "Invalid dimensions. When x_num_col_dims equals to 1 and input " - "dims equals to 4, the last dim of input must be 1, but got %d", - input_d[3])); - } - for (int i = 0; i < 3; i++) { - if (i < input_dims) { - reshape_dim3[i] = input_d[i]; - } else { - reshape_dim3[i] = 1; - } - } - nvinfer1::Dims3 reshape_dim(reshape_dim3[0], reshape_dim3[1], - reshape_dim3[2]); - auto* reshape_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *X); - reshape_layer->setReshapeDimensions(reshape_dim); - reshape_itensor = reshape_layer->getOutput(0); - if (enable_int8) { - engine_->SetTensorDynamicRange(reshape_itensor, in_scale); - } - } else { - PADDLE_ENFORCE_NE(input_dims, 1, - platform::errors::InvalidArgument( - "Invalid dimensions. When x_num_col_dims equals to " - "2, input_dims should not be 1")); - for (int i = 0; i < 4; i++) { - if (i < input_dims) { - reshape_dim4[i] = input_d[i]; - } else { - reshape_dim4[i] = 1; + "Params and input dims mismatch. 
Paddle-TRT FC " + "converter expects x_dim.nbDims > x_num_col_dims, but " + "x_dim.nbDims : %d, x_num_col_dims : %d.", + x_dim.nbDims, x_num_col_dims)); + // add shuffle before fc + nvinfer1::Dims reshape_before_fc_dim; + reshape_before_fc_dim.nbDims = x_num_col_dims + 3; + // padding shape "* x q x 1 x 1" + for (int i = 0; i < reshape_before_fc_dim.nbDims; i++) { + reshape_before_fc_dim.d[i] = 1; + } + for (int i = 0; i < x_dim.nbDims; i++) { + if (i < x_num_col_dims) { + reshape_before_fc_dim.d[i] = 0; + } else { + if (x_dim.d[i] < 0) { + reshape_before_fc_dim.d[x_num_col_dims] = -1; + break; } + reshape_before_fc_dim.d[x_num_col_dims] *= x_dim.d[i]; } - nvinfer1::Dims4 reshape_dim(reshape_dim4[0], reshape_dim4[1], - reshape_dim4[2], reshape_dim4[3]); - auto* reshape_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *X); - reshape_layer->setReshapeDimensions(reshape_dim); - reshape_itensor = reshape_layer->getOutput(0); - if (enable_int8) { - engine_->SetTensorDynamicRange(reshape_itensor, in_scale); - } + } + auto* reshape_before_fc_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *X); + reshape_before_fc_layer->setReshapeDimensions(reshape_before_fc_dim); + reshape_before_fc_layer->setName( + ("shuffle_before_fc(Output: " + output_name + ")").c_str()); + auto* reshape_itensor = reshape_before_fc_layer->getOutput(0); + if (enable_int8) { + engine_->SetTensorDynamicRange(reshape_itensor, in_scale); } regist_fc(reshape_itensor, n_output, weight, bias); } diff --git a/paddle/fluid/inference/tensorrt/convert/gather_op.cc b/paddle/fluid/inference/tensorrt/convert/gather_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..346a8bffa00e383781a2e0a26afaa97437598b8d --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/gather_op.cc @@ -0,0 +1,78 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" + +namespace paddle { +namespace framework { +class Scope; + +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace inference { +namespace tensorrt { + +/* + * Gather Op + */ +class GatherOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + VLOG(3) << "convert a fluid gather op to tensorrt gather layer"; + + framework::OpDesc op_desc(op, nullptr); + std::string input_name = op_desc.Input("X").front(); + std::string index_name = op_desc.Input("Index").front(); + std::string output_name = op_desc.Output("Out").front(); + + const auto input_tensor = engine_->GetITensor(input_name); + const auto index_tensor = engine_->GetITensor(index_name); + + const int axis = 0; + + auto layer = TRT_ENGINE_ADD_LAYER(engine_, Gather, *input_tensor, + *index_tensor, axis); + + auto odim = layer->getOutput(0)->getDimensions(); + + auto reshape_layer = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *layer->getOutput(0)); + + nvinfer1::Dims target_shape{}; + target_shape.nbDims = odim.nbDims - 1; + for (int i = 0; i < axis; ++i) { + target_shape.d[i] = odim.d[i]; + } + target_shape.d[axis] = 0; + for (int i = axis + 1; i < target_shape.nbDims; ++i) { + target_shape.d[i] = odim.d[i + 1]; + } + + reshape_layer->setReshapeDimensions(target_shape); + + RreplenishLayerAndOutput(reshape_layer, "gather", {output_name}, test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(gather, GatherOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/gelu_op.cc b/paddle/fluid/inference/tensorrt/convert/gelu_op.cc index 4c9996ca02cad48950c2c68763e2c0270cd1f9e4..0436499cd40756150d5b33c6d685d74ffbe5b87d 100644 --- a/paddle/fluid/inference/tensorrt/convert/gelu_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/gelu_op.cc @@ -47,15 +47,7 @@ class GeluOpConverter : public OpConverter { framework::OpDesc op_desc(op, nullptr); // Declare inputs int input_num = op_desc.Input("X").size(); - PADDLE_ENFORCE_EQ(input_num, 1, - platform::errors::InvalidArgument( - "gelu op has only 1 input, but got %d", input_num)); auto* input = engine_->GetITensor(op_desc.Input("X")[0]); - // Get output - size_t output_num = op_desc.Output("Out").size(); - PADDLE_ENFORCE_EQ(output_num, 1, - platform::errors::InvalidArgument( - "gelu op has only 1 output, but got %d", output_num)); nvinfer1::ILayer* layer = nullptr; if (engine_->with_dynamic_shape()) { @@ -64,7 +56,7 @@ class GeluOpConverter : public OpConverter { engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); plugin::GeluPluginDynamic* plugin = new plugin::GeluPluginDynamic(with_fp16); - layer = engine_->AddPluginV2(&input, input_num, plugin); + layer = engine_->AddDynamicPlugin(&input, input_num, plugin); #else PADDLE_THROW(platform::errors::Fatal( "You are running the TRT Dynamic Shape mode, need to confirm that " diff --git a/paddle/fluid/inference/tensorrt/convert/hard_swish_op.cc b/paddle/fluid/inference/tensorrt/convert/hard_swish_op.cc index 9dc40ceec4809489bc308485934150e8d1491c83..7ef79e547d09ab678fd1ce43c301bd893ea4e822 100644 --- a/paddle/fluid/inference/tensorrt/convert/hard_swish_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/hard_swish_op.cc @@ -41,17 +41,7 @@ class HardSwishOpConverter : public OpConverter { framework::OpDesc op_desc(op, 
nullptr); // Declare inputs int input_num = op_desc.Input("X").size(); - PADDLE_ENFORCE_EQ( - input_num, 1, - platform::errors::InvalidArgument( - "HardSwish op has only 1 input, but got %d", input_num)); auto* input = engine_->GetITensor(op_desc.Input("X")[0]); - // Get output - size_t output_num = op_desc.Output("Out").size(); - PADDLE_ENFORCE_EQ( - output_num, 1, - platform::errors::InvalidArgument( - "HardSwish op has only 1 output, but got %d", output_num)); const float threshold = op_desc.HasAttr("threshold") diff --git a/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc index c1f266bacfec5784d6f641574ce20f91496f2f35..0b97b5d87a3d506e9e14ea5780a9e7b4ac471c83 100644 --- a/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc @@ -25,25 +25,6 @@ class LayerNormOpConverter : public OpConverter { const framework::Scope& scope, bool test_mode) override { VLOG(4) << "convert a fluid layer_norm op to tensorrt layer_norm plugin"; framework::OpDesc op_desc(op, nullptr); - PADDLE_ENFORCE_EQ( - op_desc.Input("X").size(), 1, - platform::errors::InvalidArgument( - "input of layer_norm op converter should be 1, got %d", - op_desc.Input("X").size())); - PADDLE_ENFORCE_EQ(op_desc.Input("Bias").size(), 1, - platform::errors::InvalidArgument( - "Bias of layer_norm op converter should be 1, got %d", - op_desc.Input("Bias").size())); // Bias is a weight - PADDLE_ENFORCE_EQ( - op_desc.Input("Scale").size(), 1, - platform::errors::InvalidArgument( - "Scale of layer_norm op converter should be 1, got %d", - op_desc.Input("Scale").size())); // Scale is a weight - PADDLE_ENFORCE_EQ( - op_desc.Output("Y").size(), 1, - platform::errors::InvalidArgument( - "output of layer_norm op converter should be 1, got %d", - op_desc.Input("Y").size())); auto* X = engine_->GetITensor(op_desc.Input("X").front()); auto* Bias_v = scope.FindVar(op_desc.Input("Bias").front()); diff --git a/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc b/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc index c2ffb3f3197c15d24d317f2fc3290f650e753d7a..d6277b5208d5a1588cddc366e66995046f77f3fd 100644 --- a/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc @@ -36,21 +36,7 @@ class LeakyReluOpConverter : public OpConverter { VLOG(4) << "convert fluid leaky_relu op to tensorrt layer"; framework::OpDesc op_desc(op, nullptr); - // Declare inputs - size_t input_num = op_desc.Input("X").size(); - PADDLE_ENFORCE_EQ(input_num, 1UL, - platform::errors::InvalidArgument( - "Invalid number of TRT leaky_relu op converter " - "inputs. Expected 1, but received %d", - input_num)); auto* input = engine_->GetITensor(op_desc.Input("X")[0]); - // Get output - size_t output_num = op_desc.Output("Out").size(); - PADDLE_ENFORCE_EQ(output_num, 1UL, - platform::errors::InvalidArgument( - "Invalid number of TRT leaky_relu op converter " - "outputs. 
Expected 1, but received %d", - output_num)); // Get attrs float alpha = BOOST_GET_CONST(float, op_desc.GetAttr("alpha")); nvinfer1::ILayer* output_layer = nullptr; diff --git a/paddle/fluid/inference/tensorrt/convert/multiclass_nms_op.cc b/paddle/fluid/inference/tensorrt/convert/multiclass_nms_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..b0d67a5bf90ca9fcad742367a4c1a3c2c3eb0ee2 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/multiclass_nms_op.cc @@ -0,0 +1,133 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" + +namespace paddle { +namespace framework { +class Scope; +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace inference { +namespace tensorrt { + +class MultiClassNMSOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + VLOG(3) << "convert a fluid multiclassNMS op to tensorrt plugin"; + + // for now, only work for static shape and regular tensor + framework::OpDesc op_desc(op, nullptr); + + std::string bboxes = op_desc.Input("BBoxes").front(); + std::string scores = op_desc.Input("Scores").front(); + std::string output_name = op_desc.Output("Out").front(); + + auto* bboxes_tensor = engine_->GetITensor(bboxes); + auto* scores_tensor = engine_->GetITensor(scores); + + int background_label = + BOOST_GET_CONST(int, op_desc.GetAttr("background_label")); + float score_threshold = + BOOST_GET_CONST(float, op_desc.GetAttr("score_threshold")); + int nms_top_k = BOOST_GET_CONST(int, op_desc.GetAttr("nms_top_k")); + float nms_threshold = + BOOST_GET_CONST(float, op_desc.GetAttr("nms_threshold")); + int keep_top_k = BOOST_GET_CONST(int, op_desc.GetAttr("keep_top_k")); + bool normalized = BOOST_GET_CONST(bool, op_desc.GetAttr("normalized")); + int num_classes = scores_tensor->getDimensions().d[0]; + + auto bboxes_dims = bboxes_tensor->getDimensions(); + nvinfer1::Dims3 bboxes_expand_dims(bboxes_dims.d[0], 1, bboxes_dims.d[1]); + auto* bboxes_expand_layer = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *bboxes_tensor); + bboxes_expand_layer->setReshapeDimensions(bboxes_expand_dims); + + nvinfer1::Permutation permutation{1, 0}; + auto* scores_transpose_layer = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *scores_tensor); + scores_transpose_layer->setFirstTranspose(permutation); + + std::vector batch_nms_inputs; + batch_nms_inputs.push_back(bboxes_expand_layer->getOutput(0)); + batch_nms_inputs.push_back(scores_transpose_layer->getOutput(0)); + + constexpr bool shareLocation = true; + constexpr bool clip_boxes = false; + + const std::vector fields{ + {"shareLocation", &shareLocation, nvinfer1::PluginFieldType::kINT32, 1}, + {"backgroundLabelId", &background_label, + nvinfer1::PluginFieldType::kINT32, 1}, + {"numClasses", &num_classes, 
nvinfer1::PluginFieldType::kINT32, 1}, + {"topK", &nms_top_k, nvinfer1::PluginFieldType::kINT32, 1}, + {"keepTopK", &keep_top_k, nvinfer1::PluginFieldType::kINT32, 1}, + {"scoreThreshold", &score_threshold, + nvinfer1::PluginFieldType::kFLOAT32, 1}, + {"iouThreshold", &nms_threshold, nvinfer1::PluginFieldType::kFLOAT32, + 1}, + {"isNormalized", &normalized, nvinfer1::PluginFieldType::kINT32, 1}, + {"clipBoxes", &clip_boxes, nvinfer1::PluginFieldType::kINT32, 1}, + }; + + nvinfer1::PluginFieldCollection* plugin_collections = + static_cast( + malloc(sizeof(*plugin_collections) + + fields.size() * sizeof(nvinfer1::PluginField))); + plugin_collections->nbFields = static_cast(fields.size()); + plugin_collections->fields = fields.data(); + + auto creator = GetPluginRegistry()->getPluginCreator("BatchedNMS_TRT", "1"); + auto batch_nms_plugin = + creator->createPlugin("BatchNMSPlugin", plugin_collections); + free(plugin_collections); + + auto batch_nms_layer = engine_->network()->addPluginV2( + batch_nms_inputs.data(), batch_nms_inputs.size(), *batch_nms_plugin); + auto nmsed_boxes = batch_nms_layer->getOutput(1); + auto nmsed_scores = batch_nms_layer->getOutput(2); + auto nmsed_classes = batch_nms_layer->getOutput(3); + + auto nmsed_scores_transpose_layer = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *nmsed_scores); + nmsed_scores_transpose_layer->setReshapeDimensions( + nvinfer1::Dims2(keep_top_k, 1)); + auto nmsed_classes_reshape_layer = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *nmsed_classes); + nmsed_classes_reshape_layer->setReshapeDimensions( + nvinfer1::Dims2(keep_top_k, 1)); + + std::vector concat_inputs; + concat_inputs.push_back(nmsed_classes_reshape_layer->getOutput(0)); + concat_inputs.push_back(nmsed_scores_transpose_layer->getOutput(0)); + concat_inputs.push_back(nmsed_boxes); + + auto nms_concat_layer = TRT_ENGINE_ADD_LAYER( + engine_, Concatenation, concat_inputs.data(), concat_inputs.size()); + nms_concat_layer->setAxis(1); + + RreplenishLayerAndOutput(nms_concat_layer, "multiclass_nms", {output_name}, + test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(multiclass_nms, MultiClassNMSOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc index ee04fd372c4588b492948a5acd23493d2ba423c5..f2f45c694ab44fb03cfd6b018ef0a0a1ae6f0a31 100644 --- a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc @@ -8,8 +8,8 @@ http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See +the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" @@ -28,7 +28,6 @@ class MultiheadMatMulOpConverter : public OpConverter { "network structure"; framework::OpDesc op_desc(op, nullptr); // Declare inputs - // Shouble be a 5 dims tensor. 
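// --- Note (illustrative, not part of this patch) --------------------------------
// The int8 handling added in this converter follows the symmetric-quantization
// convention used by the Paddle-TRT integration: SetTensorDynamicRange(t, r)
// declares |t| <= r. Paddle's quantization pass stores "Input_scale" as a per-step
// scale s (float_value ~= int8_value * s), so the range bound is s * 127, which is
// why the attribute is multiplied by 127 below. "out_threshold" is already a range
// bound, so it is passed to SetTensorDynamicRange directly and divided by 127.0 to
// recover the per-step scale handed to the oss plugin as the "dq_probs" field.
// Rough arithmetic, assuming (hypothetically) Input_scale = 0.02:
//   dynamic range = 0.02 * 127 = 2.54, i.e. the largest activation magnitude
//   representable for that tensor at int8 precision.
// ---------------------------------------------------------------------------------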
auto* input = engine_->GetITensor(op_desc.Input("Input").front()); // fc weights and fc bias @@ -41,8 +40,25 @@ class MultiheadMatMulOpConverter : public OpConverter { auto* bias_v = scope.FindVar(bias_name); auto* bias_t = bias_v->GetMutable(); - float* weight_data = - engine_->GetWeightCPUData(weight_name, weight_t, false); + float* weight_data = nullptr; + bool enable_int8 = op_desc.HasAttr("enable_int8"); + float in_scale = 0.; + + if (enable_int8) { + PADDLE_ENFORCE_EQ( + op_desc.HasAttr("Input_scale"), true, + platform::errors::InvalidArgument( + "must have input scale in multihead layers in int8 mode")); + in_scale = BOOST_GET_CONST(float, op_desc.GetAttr("Input_scale")) * 127; + auto weight_scale = + BOOST_GET_CONST(std::vector, op_desc.GetAttr("weight_scale")); + weight_data = + engine_->GetWeightCPUData(weight_name, weight_t, true, weight_scale); + engine_->SetTensorDynamicRange(input, in_scale); + } else { + weight_data = engine_->GetWeightCPUData(weight_name, weight_t, false); + } + float* bias_data = engine_->GetWeightCPUData(bias_name, bias_t, false); std::vector weight_data_tmp; weight_data_tmp.reserve(weight_t->numel()); @@ -69,6 +85,7 @@ class MultiheadMatMulOpConverter : public OpConverter { int head_number = BOOST_GET_CONST(int, op_desc.GetAttr("head_number")); nvinfer1::ILayer* layer = nullptr; + auto output_name = op_desc.Output("Out")[0]; if (engine_->with_dynamic_shape()) { if (engine_->use_oss()) { @@ -117,8 +134,27 @@ class MultiheadMatMulOpConverter : public OpConverter { static_cast(bias_data), static_cast(bias_t->numel())}; - auto* fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *input, - n, weight, bias); + nvinfer1::ILayer* fc_layer = nullptr; + float dp_probs = 1.0 / 127.0; + if (enable_int8) { + nvinfer1::DimsHW nv_ksize(1, 1); + fc_layer = TRT_ENGINE_ADD_LAYER(engine_, Convolution, *input, n, + nv_ksize, weight, bias); + } else { + fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *input, n, + weight, bias); + } + + if (enable_int8) { + PADDLE_ENFORCE_EQ( + op_desc.HasAttr("out_threshold"), true, + platform::errors::InvalidArgument( + "must have out threshold in multihead layers in int8 mode")); + float out_scale = + BOOST_GET_CONST(float, op_desc.GetAttr("out_threshold")); + engine_->SetTensorDynamicRange(fc_layer->getOutput(0), out_scale); + dp_probs = out_scale / 127.0; + } auto mask_tensor = engine_->GetITensor("qkv_plugin_mask"); @@ -128,6 +164,9 @@ class MultiheadMatMulOpConverter : public OpConverter { int type = static_cast((engine_->WithFp16() == 1) ? 
nvinfer1::DataType::kHALF : nvinfer1::DataType::kFLOAT); + if (enable_int8) { + type = static_cast(nvinfer1::DataType::kHALF); + } bool has_mask = true; int var_seqlen = 1; const std::vector fields{ @@ -136,7 +175,7 @@ class MultiheadMatMulOpConverter : public OpConverter { {"num_heads", &head_number, nvinfer1::PluginFieldType::kINT32, 1}, {"has_mask", &has_mask, nvinfer1::PluginFieldType::kINT32, 1}, {"var_seqlen", &var_seqlen, nvinfer1::PluginFieldType::kINT32, 1}, - }; + { "dq_probs", &dp_probs, nvinfer1::PluginFieldType::kFLOAT32, 1 }}; nvinfer1::PluginFieldCollection* plugin_collection = static_cast( malloc(sizeof(*plugin_collection) + @@ -171,6 +210,12 @@ class MultiheadMatMulOpConverter : public OpConverter { plugin_inputs.data(), plugin_inputs.size(), *plugin); layer = plugin_layer; } else { + PADDLE_ENFORCE_EQ( + input->getDimensions().nbDims, 3, + platform::errors::InvalidArgument( + "The Input dim of the MultiheadMatMul should be 3, " + "but it's (%d) now.", + input->getDimensions().nbDims)); // transpose weight_data from m * n to n * m auto* input_bias_qk = engine_->GetITensor(op_desc.Input("BiasQK").front()); @@ -184,22 +229,44 @@ class MultiheadMatMulOpConverter : public OpConverter { static_cast(bias_data), static_cast(bias_t->numel())}; - auto* fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *input, - n, weight.get(), bias.get()); - auto* fc_out = fc_layer->getOutput(0); + // add shuffle before fc + nvinfer1::Dims reshape_before_fc_dim; + reshape_before_fc_dim.nbDims = 5; + reshape_before_fc_dim.d[0] = 0; + reshape_before_fc_dim.d[1] = 0; + reshape_before_fc_dim.d[2] = 0; + reshape_before_fc_dim.d[3] = 1; + reshape_before_fc_dim.d[4] = 1; + auto* reshape_before_fc_layer = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input); + reshape_before_fc_layer->setReshapeDimensions(reshape_before_fc_dim); + reshape_before_fc_layer->setName( + ("shuffle_before_multihead_mamul(Output: " + output_name + ")") + .c_str()); + + // add layer fc + auto* fc_layer = TRT_ENGINE_ADD_LAYER( + engine_, FullyConnected, *reshape_before_fc_layer->getOutput(0), n, + weight.get(), bias.get()); + fc_layer->setName( + ("multihead_mamul_fc(Output: " + output_name + ")").c_str()); + + // no need to add shuffle after fc, just change it in + // QkvToContextPluginDynamic + // add qkv to context int head_size = hidden_out / head_number; float scale = BOOST_GET_CONST(float, op_desc.GetAttr("alpha")); std::vector plugin_inputs; - plugin_inputs.push_back(fc_out); + plugin_inputs.push_back(fc_layer->getOutput(0)); plugin_inputs.push_back(input_bias_qk); bool with_fp16 = engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); plugin::DynamicPluginTensorRT* plugin = new plugin::QkvToContextPluginDynamic(hidden_in, head_number, head_size, scale, with_fp16); - layer = engine_->AddPluginV2(plugin_inputs.data(), 2, plugin); + layer = engine_->AddDynamicPlugin(plugin_inputs.data(), 2, plugin); } } else { PADDLE_THROW(platform::errors::Fatal( @@ -208,7 +275,6 @@ class MultiheadMatMulOpConverter : public OpConverter { "You can use the config.SetTRTDynamicShapeInfo(...) 
interface to set " "the shape information to run the dynamic shape mode.")); } - auto output_name = op_desc.Output("Out")[0]; RreplenishLayerAndOutput(layer, "multihead_matmul", {output_name}, test_mode); #else diff --git a/paddle/fluid/inference/tensorrt/convert/nearest_interp_op.cc b/paddle/fluid/inference/tensorrt/convert/nearest_interp_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..3940cc5dce1b0003eb947e63706b9ebd0463ef6a --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/nearest_interp_op.cc @@ -0,0 +1,107 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" + +namespace paddle { +namespace framework { +class Scope; +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace inference { +namespace tensorrt { + +class NearestInterpolateOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + VLOG(3) << "convert a fluid nearest_interp op"; + + framework::OpDesc op_desc(op, nullptr); + + std::string input_name = op_desc.Input("X").front(); + std::string output_name = op_desc.Output("Out").front(); + + auto input = engine_->GetITensor(input_name); + + auto data_layout = framework::StringToDataLayout( + BOOST_GET_CONST(std::string, op_desc.GetAttr("data_layout"))); + auto interp_method = + BOOST_GET_CONST(std::string, op_desc.GetAttr("interp_method")); + bool align_corners = + BOOST_GET_CONST(bool, op_desc.GetAttr("align_corners")); + + auto input_names = op_desc.Input("X"); + auto scale = BOOST_GET_CONST(float, op_desc.GetAttr("scale")); + auto out_h = BOOST_GET_CONST(int, op_desc.GetAttr("out_h")); + auto out_w = BOOST_GET_CONST(int, op_desc.GetAttr("out_w")); + + auto layer = TRT_ENGINE_ADD_LAYER(engine_, Resize, *input); + layer->setAlignCorners(align_corners); + + auto in_dim = input->getDimensions(); + + float scale_h = 1.f; + float scale_w = 1.f; + + std::vector scales; + + if (scale > 0.f && (out_h <= 0 && out_w <= 0)) { + scale_h = scale; + scale_w = scale; + } else { + // axis are different in static/dynamic mode + bool with_dynamic = engine_->with_dynamic_shape(); + + int h_axis = (data_layout == framework::DataLayout::kNCHW) + with_dynamic; + int w_axis = + (data_layout == framework::DataLayout::kNCHW) + 1 + with_dynamic; + + scale_h = + static_cast(out_h) / static_cast(in_dim.d[h_axis]); + scale_w = + static_cast(out_w) / static_cast(in_dim.d[w_axis]); + } + + if (engine_->with_dynamic_shape()) { + scales.push_back(1.f); + } + + if (data_layout == framework::DataLayout::kNCHW) { + scales.push_back(1.f); + scales.push_back(scale_h); + scales.push_back(scale_w); + } else if (data_layout == framework::DataLayout::kNHWC) { + // NHWC + scales.push_back(scale_h); + scales.push_back(scale_w); + scales.push_back(1.f); 
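          // Note (illustrative, not part of this patch): IResizeLayer::setScales
          // expects one scale factor per input dimension, in dimension order. With
          // an explicit batch (dynamic shape) a leading 1.0 is pushed first; then
          // NCHW gets {1, scale_h, scale_w} while NHWC gets {scale_h, scale_w, 1},
          // matching the branches above. For example, with in_h = 32 and
          // out_h = 64, scale_h = 64 / 32 = 2.0.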
+ } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Data layout must be NCHW or NHWC.")); + } + layer->setScales(scales.data(), scales.size()); + + RreplenishLayerAndOutput(layer, "nearest_interp", {output_name}, test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(nearest_interp, NearestInterpolateOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h index 8de16df0a2f610b30da389bc73e122074d66471e..f72ae2c3ec2d7e013247f294a6f3e6dd4572ae35 100644 --- a/paddle/fluid/inference/tensorrt/convert/op_converter.h +++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h @@ -109,6 +109,12 @@ class OpConverter { it, platform::errors::Unimplemented("no OpConverter for optype [%s]", op_desc.Type())); } + if (op_desc.Type() == "depthwise_conv2d_transpose") { + it = Registry::Global().Lookup("conv2d_transpose"); + PADDLE_ENFORCE_NOT_NULL( + it, platform::errors::Unimplemented("no OpConverter for optype [%s]", + op_desc.Type())); + } if (op_desc.Type() == "transpose2") { it = Registry::Global().Lookup("transpose"); PADDLE_ENFORCE_NOT_NULL( diff --git a/paddle/fluid/inference/tensorrt/convert/pad_op.cc b/paddle/fluid/inference/tensorrt/convert/pad_op.cc index 6bf50e4742dd28af1ad96670f2d15406bbb3987e..d6711bbbd2cb52fc4508f100ab1e5f1781cc4177 100644 --- a/paddle/fluid/inference/tensorrt/convert/pad_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pad_op.cc @@ -43,8 +43,6 @@ class PadOpConverter : public OpConverter { const std::vector paddings = BOOST_GET_CONST(std::vector, op_desc.GetAttr("paddings")); - const float pad_value = - BOOST_GET_CONST(float, op_desc.GetAttr("pad_value")); nvinfer1::Dims input_shape = input->getDimensions(); int nbDims = input_shape.nbDims; @@ -62,9 +60,6 @@ class PadOpConverter : public OpConverter { "(nbDims + 1) * 2 == pad_size. 
But " "received nbDims:%d, pad_size:%d.", nbDims, pad_size)); - PADDLE_ENFORCE_EQ(pad_value, 0.0, - platform::errors::InvalidArgument( - "The pad layer of TRT only support zero.")); nvinfer1::DimsHW pre_pad(paddings[pad_size - 4], paddings[pad_size - 2]); nvinfer1::DimsHW post_pad(paddings[pad_size - 3], paddings[pad_size - 1]); diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc index aa4e54b58457227330e22c2a3e29868fe6215de6..90d6392fd6404ef8b46e3ae6783c24691995fa00 100644 --- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc @@ -66,15 +66,6 @@ class Pool2dOpConverter : public OpConverter { VLOG(4) << "convert a fluid pool2d op to tensorrt pool2d layer without bias"; framework::OpDesc op_desc(op, nullptr); - PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1UL, - platform::errors::InvalidArgument( - "TRT Pool2d expect 1 input, but got %d input.", - op_desc.Input("X").size())); - PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1UL, - platform::errors::InvalidArgument( - "TRT Pool2d expect 1 Output, but got %d output.", - op_desc.Output("Out").size())); - auto *input1 = engine_->GetITensor(op_desc.Input("X")[0]); nvinfer1::Dims input_shape = input1->getDimensions(); int input_dims = input_shape.nbDims; @@ -110,10 +101,6 @@ class Pool2dOpConverter : public OpConverter { nv_pool_type = nvinfer1::PoolingType::kAVERAGE; reduce_operation = nvinfer1::ReduceOperation::kAVG; plugin_pool_type = plugin::PoolPlugin::PoolType::avg; - } else { - PADDLE_THROW(platform::errors::Fatal( - "Wrong pool op type, the trt do not support the %s pool type.", - pool_type)); } nvinfer1::DimsHW nv_ksize(ksize[0], ksize[1]); @@ -147,7 +134,7 @@ class Pool2dOpConverter : public OpConverter { plugin::PoolPluginDynamic *plugin = new plugin::PoolPluginDynamic(ceil_mode, pool_type, adaptive, ksize, strides, paddings, global_pooling); - layer = engine_->AddPluginV2(&input1, 1, plugin); + layer = engine_->AddDynamicPlugin(&input1, 1, plugin); #endif } auto output_name = op_desc.Output("Out")[0]; diff --git a/paddle/fluid/inference/tensorrt/convert/prelu_op.cc b/paddle/fluid/inference/tensorrt/convert/prelu_op.cc index 5e881ecbbc4e2cc8e81b9334dc827513bfad02eb..a8a36e1238168ad368a02bf2ebed915939c3d5c1 100644 --- a/paddle/fluid/inference/tensorrt/convert/prelu_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/prelu_op.cc @@ -31,19 +31,7 @@ class PReluOpConverter : public OpConverter { framework::OpDesc op_desc(op, nullptr); // Declare inputs size_t input_num = op_desc.Input("X").size(); - PADDLE_ENFORCE_EQ(input_num, 1UL, - platform::errors::InvalidArgument( - "Invalid input X's size of prelu TRT converter. " - "Expected 1, received %d.", - input_num)); auto* input = engine_->GetITensor(op_desc.Input("X")[0]); - // Get output - size_t output_num = op_desc.Output("Out").size(); - PADDLE_ENFORCE_EQ(output_num, 1UL, - platform::errors::InvalidArgument( - "Invalid output Out's size of prelu TRT converter. 
" - "Expected 1, received %d.", - output_num)); // Get attrs std::string mode = BOOST_GET_CONST(std::string, op_desc.GetAttr("mode")); // @@ -65,7 +53,7 @@ class PReluOpConverter : public OpConverter { #if IS_TRT_VERSION_GE(6000) plugin::PReluPluginDynamic* plugin = new plugin::PReluPluginDynamic( alpha_data, alpha_tensor_temp->numel(), mode); - layer = engine_->AddPluginV2(&input, input_num, plugin); + layer = engine_->AddDynamicPlugin(&input, input_num, plugin); #else PADDLE_THROW(platform::errors::Fatal( "You are running the TRT Dynamic Shape mode, need to confirm that " diff --git a/paddle/fluid/inference/tensorrt/convert/roi_align_op.cc b/paddle/fluid/inference/tensorrt/convert/roi_align_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..654fe7e013379669c0d67e8690215b6eaca18443 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/roi_align_op.cc @@ -0,0 +1,80 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.h" + +namespace paddle { +namespace framework { +class Scope; + +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace inference { +namespace tensorrt { + +/* + * Roi Align Op + */ +class RoiAlignOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + VLOG(3) << "convert a fluid roi align op to tensorrt plugin"; + + framework::OpDesc op_desc(op, nullptr); + std::string input_name = op_desc.Input("X").front(); + std::string rois_name = op_desc.Input("ROIs").front(); + std::string output_name = op_desc.Output("Out").front(); + + const auto pooled_height = + BOOST_GET_CONST(int, op_desc.GetAttr("pooled_height")); + const auto pooled_width = + BOOST_GET_CONST(int, op_desc.GetAttr("pooled_width")); + const auto spatial_scale = + BOOST_GET_CONST(float, op_desc.GetAttr("spatial_scale")); + const auto sampling_ratio = + BOOST_GET_CONST(int, op_desc.GetAttr("sampling_ratio")); + + const auto input_tensor = engine_->GetITensor(input_name); + const auto rois_tensor = engine_->GetITensor(rois_name); + + const nvinfer1::DataType data_type_ = engine_->WithFp16() + ? 
nvinfer1::DataType::kHALF + : nvinfer1::DataType::kFLOAT; + + std::vector inputs{input_tensor, rois_tensor}; + nvinfer1::ILayer* layer = nullptr; + + auto* roi_align_plugin = new plugin::RoiAlignPluginDynamic( + data_type_, pooled_height, pooled_width, spatial_scale, sampling_ratio); + auto roi_align_layer = engine_->network()->addPluginV2( + inputs.data(), inputs.size(), *roi_align_plugin); + layer = roi_align_layer; + + std::vector output_names{output_name}; + RreplenishLayerAndOutput(layer, "roi_align", output_names, test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(roi_align, RoiAlignOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/shuffle_channel_op.cc b/paddle/fluid/inference/tensorrt/convert/shuffle_channel_op.cc index bf1f82076a66ce4a17ce7966bc0974d3de507008..0fdc262f7e740bc577bdb21a457d4288fcf7bf94 100644 --- a/paddle/fluid/inference/tensorrt/convert/shuffle_channel_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/shuffle_channel_op.cc @@ -50,12 +50,6 @@ class ShuffleChannelOpConverter : public OpConverter { int w = input_dims.d[2]; int group = BOOST_GET_CONST(int, op_desc.GetAttr("group")); - if (engine_->with_dynamic_shape()) { - PADDLE_THROW(platform::errors::Fatal( - "You are running the TRT Dynamic Shape mode, " - "the shuffle_channel op does not support dynamic shape yet")); - } - auto* layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input); nvinfer1::Dims4 reshape_dim(group, c / group, h, w); layer->setReshapeDimensions(reshape_dim); diff --git a/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc index 2e4a4e6120d2d835798f646b9c60b4fe2dbebf8e..e621ac0514109d40295cb402f1803b17da39bc87 100644 --- a/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc +++ b/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc @@ -49,55 +49,60 @@ class SkipLayerNormOpConverter : public OpConverter { auto* scale = get_persistable_data("Scale", &scale_dims); int bias_size = framework::product(bias_dims); int scale_size = framework::product(scale_dims); + bool enable_int8 = op_desc.HasAttr("enable_int8"); nvinfer1::ILayer* layer = nullptr; - if (engine_->with_dynamic_shape()) { - if (engine_->use_oss()) { - auto creator = GetPluginRegistry()->getPluginCreator( - "CustomSkipLayerNormPluginDynamic", "2"); - assert(creator != nullptr); - int type = static_cast((engine_->WithFp16() == 1) - ? 
nvinfer1::DataType::kHALF - : nvinfer1::DataType::kFLOAT); - int ld = input1->getDimensions().d[2]; // hidden dimension - assert(ld > 0); - - const std::vector fields{ - {"type_id", &type, nvinfer1::PluginFieldType::kINT32, 1}, - {"ld", &ld, nvinfer1::PluginFieldType::kINT32, 1}, - {"beta", bias, nvinfer1::PluginFieldType::kFLOAT32, bias_size}, - {"gamma", scale, nvinfer1::PluginFieldType::kFLOAT32, scale_size}, - }; - nvinfer1::PluginFieldCollection* pluginPtr = - static_cast( - malloc(sizeof(*pluginPtr) + - fields.size() * - sizeof(nvinfer1::PluginField))); // remember to free - pluginPtr->nbFields = static_cast(fields.size()); - pluginPtr->fields = fields.data(); - - auto pluginObj = creator->createPlugin( - "CustomSkipLayerNormPluginDynamic", pluginPtr); - auto plugin_layer = engine_->network()->addPluginV2( - inputs.data(), inputs.size(), *pluginObj); - - assert(plugin_layer != nullptr); - layer = plugin_layer; - } else { - float eps = BOOST_GET_CONST(float, op_desc.GetAttr("epsilon")); - bool with_fp16 = - engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); - plugin::SkipLayerNormPluginDynamic* plugin = - new plugin::SkipLayerNormPluginDynamic(bias, scale, bias_size, - scale_size, eps, with_fp16); - layer = engine_->AddPluginV2(inputs.data(), 2, plugin); + + if (engine_->use_oss()) { + auto creator = GetPluginRegistry()->getPluginCreator( + "CustomSkipLayerNormPluginDynamic", "2"); + PADDLE_ENFORCE_NE( + creator, nullptr, + platform::errors::InvalidArgument( + "fail to get creator of CustomSkipLayerNormPluginDynamic")); + int type = static_cast((engine_->WithFp16() == 1) + ? nvinfer1::DataType::kHALF + : nvinfer1::DataType::kFLOAT); + int ld = input1->getDimensions().d[2]; // hidden dimension + PADDLE_ENFORCE_GT(ld, 0, platform::errors::InvalidArgument( + "in CustomSkipLayerNormPluginDynamic hidden " + "dimension should > 0")); + if (enable_int8) { + type = static_cast(nvinfer1::DataType::kHALF); } + + const std::vector fields{ + {"type_id", &type, nvinfer1::PluginFieldType::kINT32, 1}, + {"ld", &ld, nvinfer1::PluginFieldType::kINT32, 1}, + {"beta", bias, nvinfer1::PluginFieldType::kFLOAT32, bias_size}, + {"gamma", scale, nvinfer1::PluginFieldType::kFLOAT32, scale_size}, + }; + nvinfer1::PluginFieldCollection* pluginPtr = + static_cast( + malloc(sizeof(*pluginPtr) + + fields.size() * + sizeof(nvinfer1::PluginField))); // remember to free + pluginPtr->nbFields = static_cast(fields.size()); + pluginPtr->fields = fields.data(); + + auto pluginObj = + creator->createPlugin("CustomSkipLayerNormPluginDynamic", pluginPtr); + auto plugin_layer = engine_->network()->addPluginV2( + inputs.data(), inputs.size(), *pluginObj); + + PADDLE_ENFORCE_NE( + plugin_layer, nullptr, + platform::errors::InvalidArgument( + "fail to add CustomSkipLayerNormPluginDynamic layer")); + layer = plugin_layer; } else { - PADDLE_THROW(platform::errors::Fatal( - "You are running the Ernie(Bert) model in static" - "shape mode, which is not supported for the time being.\n" - "You can use the config.SetTRTDynamicShapeInfo(...) 
interface" - " to set the shape information to run the dynamic shape mode.")); + float eps = BOOST_GET_CONST(float, op_desc.GetAttr("epsilon")); + bool with_fp16 = + engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); + plugin::SkipLayerNormPluginDynamic* plugin = + new plugin::SkipLayerNormPluginDynamic(bias, scale, bias_size, + scale_size, eps, with_fp16); + layer = engine_->AddDynamicPlugin(inputs.data(), 2, plugin); } auto output_name = op_desc.Output("Out")[0]; diff --git a/paddle/fluid/inference/tensorrt/convert/slice_op.cc b/paddle/fluid/inference/tensorrt/convert/slice_op.cc index 0bd2b8c9bf5eef2d2a9b45227cf09ae76ce3bb9a..2ab024dff327fda45faab01afbfbe38bb7244f93 100644 --- a/paddle/fluid/inference/tensorrt/convert/slice_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/slice_op.cc @@ -31,6 +31,12 @@ class SliceOpConverter : public OpConverter { // Declare inputs auto* input = engine_->GetITensor(op_desc.Input("Input")[0]); + if (op_desc.HasAttr("out_threshold")) { + float out_scale = + BOOST_GET_CONST(float, op_desc.GetAttr("out_threshold")); + engine_->SetTensorDynamicRange(input, out_scale); + } + std::vector axes = BOOST_GET_CONST(std::vector, op_desc.GetAttr("axes")); std::vector starts = @@ -38,15 +44,6 @@ class SliceOpConverter : public OpConverter { std::vector ends = BOOST_GET_CONST(std::vector, op_desc.GetAttr("ends")); - PADDLE_ENFORCE_EQ( - starts.size(), axes.size(), - platform::errors::InvalidArgument( - "The size of starts must be equal to the size of axes.")); - PADDLE_ENFORCE_EQ( - ends.size(), axes.size(), - platform::errors::InvalidArgument( - "The size of ends must be equal to the size of axes.")); - auto input_dims = input->getDimensions(); if (!engine_->with_dynamic_shape()) { // notice that input shape is [CHW] without batch axis when input has @@ -56,10 +53,6 @@ class SliceOpConverter : public OpConverter { } input_dims.d[0] = 1; // fake batchsize, not useful here for (size_t i = 0; i < axes.size(); i++) { - // split on batch is not supported in TensorRT - PADDLE_ENFORCE_NE(axes[i], 0, platform::errors::InvalidArgument( - "Invalid slice axis. Slice on batch " - "axis is not supported in TensorRT")); if (starts[i] < 0) { starts[i] = std::max(starts[i] + input_dims.d[axes[i]], 0); } @@ -90,14 +83,14 @@ class SliceOpConverter : public OpConverter { // bool ban_fp16 = engine_->disable_trt_plugin_fp16(); plugin::SpecialSlicePluginDynamic* plugin = new plugin::SpecialSlicePluginDynamic(); - layer = engine_->AddPluginV2(plugin_inputs.data(), plugin_inputs.size(), - plugin); + layer = engine_->AddDynamicPlugin(plugin_inputs.data(), + plugin_inputs.size(), plugin); } else { bool with_fp16 = engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); plugin::SlicePluginDynamic* plugin = new plugin::SlicePluginDynamic(starts, ends, axes, with_fp16); - layer = engine_->AddPluginV2(&input, 1, plugin); + layer = engine_->AddDynamicPlugin(&input, 1, plugin); } #else PADDLE_THROW(platform::errors::Fatal( diff --git a/paddle/fluid/inference/tensorrt/convert/softmax_op.cc b/paddle/fluid/inference/tensorrt/convert/softmax_op.cc index 79992065a22407ce09d7b64ab622eae993d36e9f..9cefb24751e18dfbb3b8283152cbcd58c81adc58 100644 --- a/paddle/fluid/inference/tensorrt/convert/softmax_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/softmax_op.cc @@ -51,6 +51,7 @@ class SoftMaxOpConverter : public OpConverter { uint32_t axes = std::max(0, input_dims - 3); // TODO(cryoco): Poor workaround. Fix padded dims problem when TRT layers // support Nd. 
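    // Note (illustrative, not part of this patch): ISoftMaxLayer::setAxes takes a
    // bit mask over the input tensor's dimensions (1 << axis). In static-shape
    // mode the batch dimension is implicit in TensorRT, so a positive Paddle axis
    // (counted with the batch included) is shifted down by one; in dynamic-shape
    // mode the batch is explicit and the axis maps directly. A negative axis is
    // normalized by adding the tensor rank first (the static branch additionally
    // subtracts any padded trailing 1-dims). E.g. a dynamic-shape NCHW input of
    // rank 4 with axis = -1 resolves to axes = 4 + (-1) = 3, so setAxes(1 << 3)
    // applies softmax over the last (W) dimension.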
+ // Tips: Dynammic shape alreay fixes. int padded_dims = 0; int explicit_batch = 0; if (engine_->with_dynamic_shape()) explicit_batch = 1; @@ -62,16 +63,16 @@ class SoftMaxOpConverter : public OpConverter { } } if (!engine_->with_dynamic_shape()) { - if (axis == -1) { - axes = input_dims - 1 - padded_dims; + if (axis < 0) { + axes = input_dims + axis - padded_dims; } else { - axes = axis; + axes = axis - 1; } } else { - if (axis == -1) { - axes = input_dims - 1 - padded_dims; + if (axis < 0) { + axes = input_dims + axis; } else { - axes = axis + 1; + axes = axis; } } layer->setAxes(1 << axes); diff --git a/paddle/fluid/inference/tensorrt/convert/split_op.cc b/paddle/fluid/inference/tensorrt/convert/split_op.cc index 768c6efaa6bd40529a509698e186fa66c2e8e711..47a6dd783a70cf7b4a8c3d7beb988fbc0f6a8786 100644 --- a/paddle/fluid/inference/tensorrt/convert/split_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/split_op.cc @@ -33,17 +33,7 @@ class SplitOpConverter : public OpConverter { size_t output_num = op_desc.Output("Out").size(); // Get Attrs - PADDLE_ENFORCE_EQ(input_num, 1UL, - platform::errors::InvalidArgument( - "Invalid input X's size of split TRT converter. " - "Expected 1, received %d.", - input_num)); int axis = BOOST_GET_CONST(int, op_desc.GetAttr("axis")); - // split on batch is not supported in TensorRT - PADDLE_ENFORCE_NE( - axis, 0, - platform::errors::InvalidArgument( - "Invalid split axis. Split on batch is not supported in TensorRT")); std::vector output_lengths = BOOST_GET_CONST(std::vector, op_desc.GetAttr("sections")); @@ -90,7 +80,7 @@ class SplitOpConverter : public OpConverter { engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); plugin::SplitPluginDynamic* plugin = new plugin::SplitPluginDynamic(axis, output_lengths, with_fp16); - layer = engine_->AddPluginV2(&input, input_num, plugin); + layer = engine_->AddDynamicPlugin(&input, input_num, plugin); #else PADDLE_THROW(platform::errors::Fatal( "You are running the TRT Dynamic Shape mode, need to confirm that " @@ -101,7 +91,7 @@ class SplitOpConverter : public OpConverter { engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); plugin::SplitPlugin* plugin = new plugin::SplitPlugin(axis, output_lengths, with_fp16); - layer = engine_->AddPlugin(&input, input_num, plugin); + layer = engine_->AddPluginV2Ext(&input, input_num, plugin); } std::string layer_name = "split (Output: "; diff --git a/paddle/fluid/inference/tensorrt/convert/stack_op.cc b/paddle/fluid/inference/tensorrt/convert/stack_op.cc index 1c971fa12e27e8706f5f68a3b1a5ffb34dbd4f40..6105e10799e5527e7c238b35bd8bb60f34e8d56f 100644 --- a/paddle/fluid/inference/tensorrt/convert/stack_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/stack_op.cc @@ -45,6 +45,11 @@ class StackOpConverter : public OpConverter { for (int i = 0; i < input_num; ++i) { inputs[i] = engine_->GetITensor(input[i]); + if (op_desc.HasAttr("out_threshold")) { + float out_scale = + BOOST_GET_CONST(float, op_desc.GetAttr("out_threshold")); + engine_->SetTensorDynamicRange(inputs[i], out_scale); + } } int axis = BOOST_GET_CONST(int, op_desc.GetAttr("axis")); @@ -53,26 +58,19 @@ class StackOpConverter : public OpConverter { } nvinfer1::ILayer* layer = nullptr; - if (engine_->with_dynamic_shape()) { #if IS_TRT_VERSION_GE(6000) - bool with_fp16 = - engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); - plugin::StackPluginDynamic* plugin = - new plugin::StackPluginDynamic(axis, input_num, with_fp16); - layer = engine_->AddPluginV2(inputs, input_num, plugin); - assert(layer != 
nullptr); + bool with_fp16 = engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); + plugin::StackPluginDynamic* plugin = + new plugin::StackPluginDynamic(axis, input_num, with_fp16); + layer = engine_->AddDynamicPlugin(inputs, input_num, plugin); + PADDLE_ENFORCE_NOT_NULL( + layer, platform::errors::InvalidArgument( + "trt stack layer in converter could not be created.")); #else - PADDLE_THROW(platform::errors::Fatal( - "You are running the TRT Dynamic Shape mode, need to confirm that " - "your TRT version is no less than 6.0")); + PADDLE_THROW(platform::errors::Fatal( + "You are running the TRT Dynamic Shape mode, need to confirm that " + "your TRT version is no less than 6.0")); #endif - } else { - PADDLE_THROW(platform::errors::Fatal( - "You are running the Ernie(Bert) model in static" - "shape mode, which is not supported for the time being.\n" - "You can use the config.SetTRTDynamicShapeInfo(...) interface" - " to set the shape information to run the dynamic shape mode.")); - } auto output_name = op_desc.Output("Y").front(); RreplenishLayerAndOutput(layer, "stack", {output_name}, test_mode); free(inputs); diff --git a/paddle/fluid/inference/tensorrt/convert/swish_op.cc b/paddle/fluid/inference/tensorrt/convert/swish_op.cc index 25944a2fead6cdb3862abf9b33fb1bb57fa48953..b2e394d14eba23f025cf3f729fa0c815231110f5 100644 --- a/paddle/fluid/inference/tensorrt/convert/swish_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/swish_op.cc @@ -65,7 +65,7 @@ class SwishOpConverter : public OpConverter { engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); plugin::SwishPluginDynamic* plugin = new plugin::SwishPluginDynamic(beta, with_fp16); - layer = engine_->AddPluginV2(&input, input_num, plugin); + layer = engine_->AddDynamicPlugin(&input, input_num, plugin); #else PADDLE_THROW(platform::errors::Fatal( "You are running the TRT Dynamic Shape mode, need to confirm that " diff --git a/paddle/fluid/inference/tensorrt/convert/yolo_box_op.cc b/paddle/fluid/inference/tensorrt/convert/yolo_box_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..2d12eaf736b754d623c2aa0e3c138a2ad80800b3 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/yolo_box_op.cc @@ -0,0 +1,79 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.h" + +namespace paddle { +namespace framework { +class Scope; +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace inference { +namespace tensorrt { + +class YoloBoxOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + VLOG(3) << "convert a fluid yolo box op to tensorrt plugin"; + + framework::OpDesc op_desc(op, nullptr); + std::string X = op_desc.Input("X").front(); + std::string img_size = op_desc.Input("ImgSize").front(); + + auto* X_tensor = engine_->GetITensor(X); + auto* img_size_tensor = engine_->GetITensor(img_size); + + int class_num = BOOST_GET_CONST(int, op_desc.GetAttr("class_num")); + std::vector anchors = + BOOST_GET_CONST(std::vector, op_desc.GetAttr("anchors")); + + int downsample_ratio = + BOOST_GET_CONST(int, op_desc.GetAttr("downsample_ratio")); + float conf_thresh = BOOST_GET_CONST(float, op_desc.GetAttr("conf_thresh")); + bool clip_bbox = BOOST_GET_CONST(bool, op_desc.GetAttr("clip_bbox")); + float scale_x_y = BOOST_GET_CONST(float, op_desc.GetAttr("scale_x_y")); + + int type_id = static_cast(engine_->WithFp16()); + auto input_dim = X_tensor->getDimensions(); + auto* yolo_box_plugin = new plugin::YoloBoxPlugin( + type_id ? nvinfer1::DataType::kHALF : nvinfer1::DataType::kFLOAT, + anchors, class_num, conf_thresh, downsample_ratio, clip_bbox, scale_x_y, + input_dim.d[1], input_dim.d[2]); + + std::vector yolo_box_inputs; + yolo_box_inputs.push_back(X_tensor); + yolo_box_inputs.push_back(img_size_tensor); + + auto* yolo_box_layer = engine_->network()->addPluginV2( + yolo_box_inputs.data(), yolo_box_inputs.size(), *yolo_box_plugin); + + std::vector output_names; + output_names.push_back(op_desc.Output("Boxes").front()); + output_names.push_back(op_desc.Output("Scores").front()); + + RreplenishLayerAndOutput(yolo_box_layer, "yolo_box", output_names, + test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(yolo_box, YoloBoxOpConverter); diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc index 0bba4581ff90f931eba9399cb3b0b274342f4f16..99549fd6b5cbf96cf803e7f44b28c948daf0763d 100644 --- a/paddle/fluid/inference/tensorrt/engine.cc +++ b/paddle/fluid/inference/tensorrt/engine.cc @@ -18,7 +18,7 @@ limitations under the License. 
*/ #include #include -#include "cuda_runtime_api.h" +#include "cuda_runtime_api.h" // NOLINT #include "paddle/fluid/inference/tensorrt/helper.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/gpu_info.h" @@ -353,6 +353,13 @@ nvinfer1::IPluginLayer *TensorRTEngine::AddPlugin( return network()->addPluginExt(inputs, num_inputs, *plugin); } +nvinfer1::IPluginV2Layer *TensorRTEngine::AddPluginV2Ext( + nvinfer1::ITensor *const *inputs, int num_inputs, + plugin::PluginTensorRTV2Ext *plugin) { + owned_plugin_v2ext_.emplace_back(plugin); + return network()->addPluginV2(inputs, num_inputs, *plugin); +} + void TensorRTEngine::freshDeviceId() { int count; cudaGetDeviceCount(&count); diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index 0e399578fa446793756a23e76013c3ed9a8bb9c4..2358e1ef976cdbc26eb907aff21b81f7e52d64d9 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -305,8 +305,14 @@ class TensorRTEngine { } int GetDeviceId() { return device_id_; } + nvinfer1::IPluginLayer* AddPlugin(nvinfer1::ITensor* const* inputs, int num_inputs, plugin::PluginTensorRT*); + + nvinfer1::IPluginV2Layer* AddPluginV2Ext(nvinfer1::ITensor* const* inputs, + int num_inputs, + plugin::PluginTensorRTV2Ext* plugin); + void SetTensorDynamicRange(nvinfer1::ITensor* tensor, float range) { quant_dynamic_range_[tensor] = range; } @@ -372,9 +378,9 @@ class TensorRTEngine { bool with_dynamic_shape() { return with_dynamic_shape_; } #if IS_TRT_VERSION_GE(6000) - nvinfer1::IPluginV2Layer* AddPluginV2(nvinfer1::ITensor* const* inputs, - int num_inputs, - plugin::DynamicPluginTensorRT* plugin) { + nvinfer1::IPluginV2Layer* AddDynamicPlugin( + nvinfer1::ITensor* const* inputs, int num_inputs, + plugin::DynamicPluginTensorRT* plugin) { owned_pluginv2_.emplace_back(plugin); return network()->addPluginV2(inputs, num_inputs, *plugin); } @@ -414,6 +420,7 @@ class TensorRTEngine { itensor_map_; std::vector> owned_plugin_; + std::vector> owned_plugin_v2ext_; // TensorRT related internal members template diff --git a/paddle/fluid/inference/tensorrt/helper.h b/paddle/fluid/inference/tensorrt/helper.h index 971f99e69197226bb7d7b26135f0b667f8ebdf30..6158fd130bad8d4df70fafb2a9f72c00e40217fd 100644 --- a/paddle/fluid/inference/tensorrt/helper.h +++ b/paddle/fluid/inference/tensorrt/helper.h @@ -60,6 +60,9 @@ static nvinfer1::IRuntime* createInferRuntime(nvinfer1::ILogger* logger) { static nvinfer1::IPluginRegistry* GetPluginRegistry() { return static_cast(dy::getPluginRegistry()); } +static int GetInferLibVersion() { + return static_cast(dy::getInferLibVersion()); +} #endif // A logger for create TensorRT infer builder. 
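The engine-side changes above are the hinge for the converter edits in this patch: plugins implementing IPluginV2DynamicExt are now added through AddDynamicPlugin (the renamed AddPluginV2), while IPluginV2Ext-based plugins go through the new AddPluginV2Ext, whose lifetime is tracked in owned_plugin_v2ext_. A minimal converter-side sketch of the two paths, mirroring the split converter above (engine_, input, axis and output_lengths are the usual converter locals, not new API):

nvinfer1::ILayer* layer = nullptr;
bool with_fp16 = engine_->WithFp16() && !engine_->disable_trt_plugin_fp16();
if (engine_->with_dynamic_shape()) {
  // dynamic-shape path: an IPluginV2DynamicExt plugin via AddDynamicPlugin
  plugin::SplitPluginDynamic* plugin =
      new plugin::SplitPluginDynamic(axis, output_lengths, with_fp16);
  layer = engine_->AddDynamicPlugin(&input, 1, plugin);
} else {
  // static-shape path: an IPluginV2Ext plugin via the new AddPluginV2Ext,
  // which stores the pointer in owned_plugin_v2ext_ so the engine owns it
  plugin::SplitPlugin* plugin =
      new plugin::SplitPlugin(axis, output_lengths, with_fp16);
  layer = engine_->AddPluginV2Ext(&input, 1, plugin);
}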
@@ -67,9 +70,12 @@ class NaiveLogger : public nvinfer1::ILogger { public: void log(nvinfer1::ILogger::Severity severity, const char* msg) override { switch (severity) { - case Severity::kINFO: + case Severity::kVERBOSE: VLOG(3) << msg; break; + case Severity::kINFO: + VLOG(2) << msg; + break; case Severity::kWARNING: LOG(WARNING) << msg; break; diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 052d17878a5a9dfb3c2b2cae4644b3d4dda2942f..54fc9492b7193e90245cce23538e34a4857cfe1f 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/inference/tensorrt/op_teller.h" #include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/data_layout.h" namespace paddle { namespace framework { @@ -41,6 +42,10 @@ struct SimpleOpTypeSetTeller : public Teller { teller_set.insert("multihead_matmul"); teller_set.insert("skip_layernorm"); teller_set.insert("slice"); + int8_teller_set.insert("fused_embedding_eltwise_layernorm"); + int8_teller_set.insert("multihead_matmul"); + int8_teller_set.insert("skip_layernorm"); + int8_teller_set.insert("slice"); #endif #if IS_TRT_VERSION_GE(7130) teller_set.insert("group_norm"); @@ -60,6 +65,8 @@ struct SimpleOpTypeSetTeller : public Teller { // use this set for no calib int8. std::unordered_set int8_teller_set{"mul", "conv2d", + "matmul", + "stack", "conv2d_fusion", "pool2d", "relu", @@ -95,6 +102,7 @@ struct SimpleOpTypeSetTeller : public Teller { "dropout", "prelu", "conv2d_transpose", + "depthwise_conv2d_transpose", "leaky_relu", "fc", "shuffle_channel", @@ -109,6 +117,12 @@ struct SimpleOpTypeSetTeller : public Teller { "transpose", "flatten2", "flatten", + "gather", + "yolo_box", + "roi_align", + "affine_channel", + "nearest_interp", + "anchor_generator", }; }; @@ -124,13 +138,95 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, return false; for (auto& teller : tellers_) { - if (op_type == "pool2d" || op_type == "conv2d" || - op_type == "depthwise_conv2d" || op_type == "conv2d_transpose") { + if (op_type == "depthwise_conv2d") { std::vector paddings = BOOST_GET_CONST(std::vector, desc.GetAttr("paddings")); if (paddings.size() > 2) return false; } + + if (op_type == "pool2d") { + std::vector paddings = + BOOST_GET_CONST(std::vector, desc.GetAttr("paddings")); + if (paddings.size() > 2) return false; + if (desc.Input("X").size() != 1) { + VLOG(3) << "TRT Pool2d expect 1 input, but got " + << desc.Input("X").size(); + return false; + } + if (desc.Output("Out").size() != 1) { + VLOG(3) << "TRT Pool2d has only 1 output, but got " + << desc.Output("Out").size(); + return false; + } + if (!desc.HasAttr("pooling_type")) { + return false; + } else { + std::string pool_type = + BOOST_GET_CONST(std::string, desc.GetAttr("pooling_type")); + if (pool_type != "max" && pool_type != "avg") { + VLOG(3) << "Wrong pool op type, the trt do not support the " + << pool_type << " pool type."; + return false; + } + } + } + + if (op_type == "conv2d" || op_type == "conv2d_transpose" || + op_type == "conv2d_fusion" || op_type == "depthwise_conv2d" || + op_type == "depthwise_conv2d_transpose") { + std::vector paddings = + BOOST_GET_CONST(std::vector, desc.GetAttr("paddings")); + + // conv2d and conv2d_transpose need padding check + if (paddings.size() > 2 && op_type != "conv2d_fusion") return false; + + if (desc.Input("Input").size() != 1) { + VLOG(3) << "TRT Conv2d expect 1 input, but got " + 
<< desc.Input("Input").size() << " input."; + return false; + } + + if (desc.Input("Filter").size() != 1) { + VLOG(3) << "TRT Conv2d expect 1 filter, but got " + << desc.Input("Filter").size() << " filter."; + return false; + } + + if (desc.HasAttr("enable_int8")) { + if (op_type == "conv2d" || op_type == "conv2d_fusion") { + if (!desc.HasAttr("Input_scale")) { + VLOG(3) << "Input scale not found. TRT int8" + " requires conv/deconv to have " + "input quantization scales."; + return false; + } + } + } + + if (op_type == "conv2d_transpose" || + op_type == "depthwise_conv2d_transpose") { + if (!desc.HasAttr("dilations")) { + return false; + } else { + const std::vector dilations = + BOOST_GET_CONST(std::vector, desc.GetAttr("dilations")); + if (dilations[0] != 1 || dilations[1] != 1) { + VLOG(3) << "In conv2d_transpose, Dilations must be (1, 1) for " + "tensorRT, but given (" + << dilations[0] << ", " << dilations[1] << ")"; + return false; + } + } + } + + if (desc.Output("Output").size() != 1) { + VLOG(3) << "TRT Conv2d expect 1 output, but got " + << desc.Output("Output").size() << " output."; + return false; + } + } + if (op_type == "matmul") { auto* block = desc.Block(); for (auto& param_name : desc.Inputs()) { @@ -138,7 +234,7 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, auto* var_desc = block->FindVar(var_name); const auto shape = var_desc->GetShape(); if (shape.size() < 3) { - VLOG(1) + VLOG(3) << "matmul op dims < 3 not supported in tensorrt, but got dims " << shape.size() << ", so jump it."; return false; @@ -159,7 +255,11 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, return false; } else { int axis = BOOST_GET_CONST(int, desc.GetAttr("axis")); - if (axis <= 0) return false; + if (with_dynamic_shape) { + if (axis < 0) return false; + } else { + if (axis <= 0) return false; + } } } if (op_type == "transpose2" || op_type == "transpose") { @@ -172,7 +272,18 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, if (axis.size() >= nvinfer1::Dims::MAX_DIMS) return false; } } - if (op_type == "flatten2" || op_type == "flatten") { + if (op_type == "flatten2") { + // flatten doesn't support dynamic shape currently + if (!desc.HasAttr("axis")) { + return false; + } else { + if (with_dynamic_shape) return false; + int axis = BOOST_GET_CONST(int, desc.GetAttr("axis")); + if (axis != 1) return false; + } + } + + if (op_type == "flatten") { // flatten doesn't support dynamic shape currently if (!desc.HasAttr("axis")) { return false; @@ -182,6 +293,360 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, if (axis != 1) return false; } } + + if (op_type == "gather") { + // current not support axis from input, use default 0 + if (!with_dynamic_shape || desc.Input("Axis").size() > 0) return false; + } + + if (op_type == "yolo_box") { + if (with_dynamic_shape) return false; + bool has_attrs = + (desc.HasAttr("class_num") && desc.HasAttr("anchors") && + desc.HasAttr("downsample_ratio") && desc.HasAttr("conf_thresh") && + desc.HasAttr("clip_bbox") && desc.HasAttr("scale_x_y")); + if (!has_attrs) return false; + } + + if (op_type == "affine_channel") { + if (!desc.HasAttr("data_layout")) return false; + auto data_layout = framework::StringToDataLayout( + BOOST_GET_CONST(std::string, desc.GetAttr("data_layout"))); + if (data_layout != framework::DataLayout::kNCHW) return false; + } + + if (op_type == "multiclass_nms") { + if (with_dynamic_shape) return false; + auto* block = 
desc.Block(); + for (auto& param_name : desc.Inputs()) { + for (auto& var_name : param_name.second) { + auto* var_desc = block->FindVar(var_name); + const auto shape = var_desc->GetShape(); + if (shape.size() != 3) { + VLOG(3) << "multiclass_nms op dims != 3 not supported in tensorrt, " + "but got dims " + << shape.size() << ", so jump it."; + return false; + } + } + } + bool has_attrs = + (desc.HasAttr("background_label") && + desc.HasAttr("score_threshold") && desc.HasAttr("nms_top_k") && + desc.HasAttr("keep_top_k") && desc.HasAttr("normalized")); + if (has_attrs == false) return false; + + auto nms_top_k = BOOST_GET_CONST(int, desc.GetAttr("nms_top_k")); + if (nms_top_k < 0) return false; + + auto keep_top_k = BOOST_GET_CONST(int, desc.GetAttr("keep_top_k")); + if (keep_top_k < 0) return false; + + auto registry = GetPluginRegistry(); + if (registry == nullptr) return false; + } + + if (op_type == "nearest_interp") { + std::vector attrs{"data_layout", "interp_method", + "align_corners", "scale", + "out_h", "out_w"}; + for (auto const attr : attrs) { + if (!desc.HasAttr(attr)) return false; + } + auto data_layout = framework::StringToDataLayout( + BOOST_GET_CONST(std::string, desc.GetAttr("data_layout"))); + if (data_layout != framework::DataLayout::kNCHW && + data_layout != framework::DataLayout::kNHWC) + return false; + auto interp_method = + BOOST_GET_CONST(std::string, desc.GetAttr("interp_method")); + if (interp_method != "nearest") return false; + + if (!desc.HasAttr("scale") || !desc.HasAttr("out_h") || + !desc.HasAttr("out_w")) { + return false; + } else { + auto scale = BOOST_GET_CONST(float, desc.GetAttr("scale")); + auto out_h = BOOST_GET_CONST(int, desc.GetAttr("out_h")); + auto out_w = BOOST_GET_CONST(int, desc.GetAttr("out_w")); + if (!(scale > 0.f && (out_h <= 0 && out_w <= 0))) { + if (out_h <= 0) { + VLOG(3) << "out_h must be greater than 0 if scale is not set."; + return false; + } + if (out_w <= 0) { + VLOG(3) << "out_w must be greater than 0 if scale is not set."; + return false; + } + } + } + } + + if (op_type == "roi_align") { + if (!with_dynamic_shape) return false; + + std::vector attrs{"pooled_height", "pooled_width", + "spatial_scale", "sampling_ratio"}; + for (auto const attr : attrs) { + if (!desc.HasAttr(attr)) return false; + } + + const auto pooled_height = + BOOST_GET_CONST(int, desc.GetAttr("pooled_height")); + if (pooled_height <= 0) return false; + + const auto pooled_width = + BOOST_GET_CONST(int, desc.GetAttr("pooled_width")); + if (pooled_width <= 0) return false; + + const auto spatial_scale = + BOOST_GET_CONST(float, desc.GetAttr("spatial_scale")); + if (spatial_scale <= 0.f) return false; + } + + if (op_type == "hard_swish") { + if (desc.Input("X").size() != 1) { + VLOG(3) << "HardSwish op has only 1 input, but got " + << desc.Input("X").size(); + return false; + } + + if (desc.Output("Out").size() != 1) { + VLOG(3) << "HardSwish op has only 1 output, but got " + << desc.Output("Out").size(); + return false; + } + } + + if (op_type == "batch_norm") { + const std::vector bn_inputs = {"X", "Bias", "Mean", "Scale", + "Variance"}; + for (unsigned int i = 0; i < bn_inputs.size(); i++) { + if (desc.Input(bn_inputs[i]).size() != 1) { + VLOG(3) << "Invalid " << bn_inputs[i] + << "'s size of batch_norm TRT " + "converter. Expected 1, received " + << desc.Input(bn_inputs[i]).size() << "."; + return false; + } + } + + if (desc.Output("Y").size() != 1) { + VLOG(3) << "Invalid output Y's size of batch_norm TRT " + "converter. 
Expected 1, received " + << desc.Output("Y").size() << "."; + return false; + } + } + + if (op_type == "split") { + if (desc.Input("X").size() != 1) { + VLOG(3) << "Invalid input X's size of split TRT converter. " + "Expected 1, received " + << desc.Input("X").size() << "."; + return false; + } + if (!desc.HasAttr("axis")) { + return false; + } else { + int axis = BOOST_GET_CONST(int, desc.GetAttr("axis")); + if (axis == 0) { + VLOG(3) << "Invalid split axis. Split on batch is not supported in " + "TensorRT"; + return false; + } + } + } + + if (op_type == "slice") { + if (!desc.HasAttr("axes") || !desc.HasAttr("starts") || + !desc.HasAttr("ends")) { + return false; + } else { + std::vector axes = + BOOST_GET_CONST(std::vector, desc.GetAttr("axes")); + std::vector starts = + BOOST_GET_CONST(std::vector, desc.GetAttr("starts")); + std::vector ends = + BOOST_GET_CONST(std::vector, desc.GetAttr("ends")); + if (axes.size() != starts.size() || axes.size() != ends.size()) { + return false; + } + if (!with_dynamic_shape) { + for (size_t i = 0; i < axes.size(); i++) { + if (axes[i] == 0) { + VLOG(3) << "Invalid slice axis. Slice on batch axis is not " + "supported in TensorRT"; + return false; + } + } + } + } + } + + if (op_type == "elementwise_add" || op_type == "elementwise_mul") { + if (desc.Input("X").size() != 1) { + VLOG(3) << "The input op's Input(\"X\").size() " + "should equal to 1, but received Input(\"X\").size() = " + << desc.Input("X").size() << "."; + return false; + } + if (desc.Input("Y").size() != 1) { + VLOG(3) << "The input op's Input(\"Y\").size() " + "should equal to 1, but received Input(\"Y\").size() = " + << desc.Input("Y").size() << "."; + return false; + } + if (desc.Output("Out").size() != 1) { + VLOG(3) << "The input op's Output(\"Out\").size() " + "should equal to 1, but reveceid Output(\"Out\").size() = " + << desc.Output("Out").size() << "."; + return false; + } + } + + if (op_type == "stack") { + if (!with_dynamic_shape) { + VLOG(3) + << "static shape mode is not supported for TRT stack.\n" + "You can use the config.SetTRTDynamicShapeInfo(...) 
interface" + " to set the shape information to run the dynamic shape " + "mode."; + return false; + } + } + + if (op_type == "fused_embedding_eltwise_layernorm") { + if (!with_dynamic_shape) { + VLOG(3) << "fused_embedding_eltwise_layernorm should run on dynamic " + "shape mode."; + return false; + } + if (desc.Input("Ids").size() != desc.Input("Embs").size()) { + VLOG(3) << "The id and emb size of fused EmbEltwiseLayerNormOp " + "should be same "; + return false; + } + } + + if (op_type == "gelu") { + if (desc.Input("X").size() != 1) { + VLOG(3) << "gelu op has only 1 input, but got " + << desc.Input("X").size(); + return false; + } + if (desc.Output("Out").size() != 1) { + VLOG(3) << "gelu op has only 1 output, but got " + << desc.Output("Out").size(); + return false; + } + } + + if (op_type == "layer_norm") { + if (desc.Input("X").size() != 1) { + VLOG(3) << "input of layer_norm op converter should be 1, got " + << desc.Input("X").size(); + return false; + } + if (desc.Input("Bias").size() != 1) { + VLOG(3) << "Bias of layer_norm op converter should be 1, got " + << desc.Input("Bias").size(); + return false; + } + if (desc.Input("Scale").size() != 1) { + VLOG(3) << "Scale of layer_norm op converter should be 1, got " + << desc.Input("Scale").size(); + return false; + } + if (desc.Output("Y").size() != 1) { + VLOG(3) << "output of layer_norm op converter should be 1, got " + << desc.Output("Y").size(); + return false; + } + } + + if (op_type == "leaky_relu") { + if (desc.Input("X").size() != 1) { + VLOG(3) << "Invalid number of TRT leaky_relu op converter " + "inputs. Expected 1, but received " + << desc.Input("X").size(); + return false; + } + if (desc.Output("Out").size() != 1) { + VLOG(3) << "output of leaky_relu op converter should be 1, got " + << desc.Output("Out").size(); + return false; + } + } + + if (op_type == "pad") { + const float pad_value = BOOST_GET_CONST(float, desc.GetAttr("pad_value")); + if (pad_value != 0.0f) { + VLOG(3) << "The pad layer of TRT only support zero."; + return false; + } + } + + if (op_type == "prelu") { + if (desc.Input("X").size() != 1) { + VLOG(3) << "Invalid input X's size of prelu TRT converter. " + "Expected 1, received " + << desc.Input("X").size() << "."; + return false; + } + if (desc.Output("Out").size() != 1) { + VLOG(3) << "Invalid output Out's size of prelu TRT converter. " + "Expected 1, received " + << desc.Output("Out").size() << "."; + return false; + } + } + + if (op_type == "roi_align") { + if (!with_dynamic_shape) { + VLOG(3) << "TRT roi align plugin only accept the dynamic shape, " + "because that " + "the roi_align will change the batch size."; + return false; + } + } + + if (op_type == "shuffle_channel") { + if (with_dynamic_shape) { + VLOG(3) << "You are running the TRT Dynamic Shape mode, " + "the shuffle_channel op does not support dynamic shape yet"; + return false; + } + } + + if (op_type == "skip_layernorm") { + if (!with_dynamic_shape) { + VLOG(3) << "the skip_layernorm does not support static shape yet"; + return false; + } + } + + if (op_type == "multihead_matmul") { + if (!with_dynamic_shape) { + VLOG(3) << "the multihead_matmul does not support static shape yet"; + return false; + } + } + + if (op_type == "fc") { + int x_num_col_dims = + desc.HasAttr("x_num_col_dims") + ? BOOST_GET_CONST(int, desc.GetAttr("x_num_col_dims")) + : (desc.HasAttr("in_num_col_dims") + ? 
BOOST_GET_CONST(int, desc.GetAttr("in_num_col_dims")) + : 1); + if (x_num_col_dims < 1) { + VLOG(3) << "converter expects x_num_col_dims >= 1, " + "but x_num_col_dims = %d."; + return false; + } + } + if ((*teller)(op_type, desc, use_no_calib_int8)) return true; } return false; diff --git a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt index e37beb3b8e5c3680eda481009699091dcc1ee7a3..1804e6c5571d3a15b0b9adc67dc535b46635caa8 100644 --- a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt @@ -5,4 +5,10 @@ nv_library(tensorrt_plugin instance_norm_op_plugin.cu emb_eltwise_layernorm_plugin.cu qkv_to_context_plugin.cu skip_layernorm_op_plugin.cu slice_op_plugin.cu hard_swish_op_plugin.cu stack_op_plugin.cu special_slice_plugin.cu + anchor_generator_op_plugin.cu + yolo_box_op_plugin.cu + roi_align_op_plugin.cu DEPS enforce tensorrt_engine prelu tensor bert_encoder_functor) + +nv_test(test_split_plugin SRCS test_split_plugin.cc DEPS + paddle_framework ${GLOB_OPERATOR_DEPS} tensorrt_plugin) diff --git a/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.cu new file mode 100644 index 0000000000000000000000000000000000000000..01ee86ceb48a9ef022ba73fe0dbdab4a52324cc6 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.cu @@ -0,0 +1,566 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
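The op_teller.cc additions above decide, per op and per attribute configuration, whether a node may enter the TensorRT subgraph; returning false keeps the op on the native Paddle kernel, and the enlarged int8_teller_set plays the same role for the no-calib int8 path. A minimal sketch of how a pass consults the teller; the Global() accessor and the argument order follow this file but are an assumption, since the call site is not part of this patch:

auto& teller = paddle::inference::tensorrt::OpTeller::Global();  // assumed accessor
if (!teller.Tell(node, /*use_no_calib_int8=*/false, /*with_dynamic_shape=*/true)) {
  // unsupported op or attribute combination: leave the node outside the
  // TensorRT engine and run it with the regular Paddle operator
}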
+ +#include +#include +#include +#include + +#include "paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h" + +#include "paddle/fluid/operators/detection/anchor_generator_op.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +#define PrepareParamsOnDevice() \ + constexpr int data_size = 4; \ + cudaMalloc(&anchor_sizes_device_, anchor_sizes_.size() * data_size); \ + cudaMalloc(&aspect_ratios_device_, aspect_ratios_.size() * data_size); \ + cudaMalloc(&stride_device_, stride_.size() * data_size); \ + cudaMalloc(&variances_device_, variances_.size() * data_size); \ + cudaMemcpy(anchor_sizes_device_, anchor_sizes_.data(), \ + anchor_sizes_.size() * data_size, cudaMemcpyHostToDevice); \ + cudaMemcpy(aspect_ratios_device_, aspect_ratios_.data(), \ + aspect_ratios_.size() * data_size, cudaMemcpyHostToDevice); \ + cudaMemcpy(stride_device_, stride_.data(), stride_.size() * data_size, \ + cudaMemcpyHostToDevice); \ + cudaMemcpy(variances_device_, variances_.data(), \ + variances_.size() * data_size, cudaMemcpyHostToDevice); + +AnchorGeneratorPlugin::AnchorGeneratorPlugin( + const nvinfer1::DataType data_type, const std::vector& anchor_sizes, + const std::vector& aspect_ratios, const std::vector& stride, + const std::vector& variances, const float offset, const int height, + const int width, const int num_anchors, const int box_num) + : data_type_(data_type), + anchor_sizes_(anchor_sizes), + aspect_ratios_(aspect_ratios), + stride_(stride), + variances_(variances), + offset_(offset), + height_(height), + width_(width), + num_anchors_(num_anchors), + box_num_(box_num) { + // anchors must be float32, which is the generator proposals' input + PADDLE_ENFORCE_EQ(data_type_, nvinfer1::DataType::kFLOAT, + platform::errors::InvalidArgument( + "TRT anchor generator plugin only accepts float32.")); + PADDLE_ENFORCE_GE(height_, 0, + platform::errors::InvalidArgument( + "TRT anchor generator plugin only accepts height " + "greater than 0, but receive height = %d.", + height_)); + PADDLE_ENFORCE_GE(width_, 0, + platform::errors::InvalidArgument( + "TRT anchor generator plugin only accepts width " + "greater than 0, but receive width = %d.", + width_)); + PADDLE_ENFORCE_GE( + num_anchors_, 0, + platform::errors::InvalidArgument( + "TRT anchor generator plugin only accepts number of anchors greater " + "than 0, but receive number of anchors = %d.", + num_anchors_)); + PADDLE_ENFORCE_GE(box_num_, 0, + platform::errors::InvalidArgument( + "TRT anchor generator plugin only accepts box_num " + "greater than 0, but receive box_num = %d.", + box_num_)); + PrepareParamsOnDevice(); +} + +AnchorGeneratorPlugin::~AnchorGeneratorPlugin() { + auto release_device_ptr = [](void* ptr) { + if (ptr) { + cudaFree(ptr); + ptr = nullptr; + } + }; + release_device_ptr(anchor_sizes_device_); + release_device_ptr(aspect_ratios_device_); + release_device_ptr(stride_device_); + release_device_ptr(variances_device_); +} + +AnchorGeneratorPlugin::AnchorGeneratorPlugin(const void* data, size_t length) { + DeserializeValue(&data, &length, &data_type_); + DeserializeValue(&data, &length, &anchor_sizes_); + DeserializeValue(&data, &length, &aspect_ratios_); + DeserializeValue(&data, &length, &stride_); + DeserializeValue(&data, &length, &variances_); + DeserializeValue(&data, &length, &offset_); + DeserializeValue(&data, &length, &height_); + DeserializeValue(&data, &length, &width_); + DeserializeValue(&data, 
&length, &num_anchors_); + DeserializeValue(&data, &length, &box_num_); + PrepareParamsOnDevice(); +} + +const char* AnchorGeneratorPlugin::getPluginType() const { + return "anchor_generator_plugin"; +} + +const char* AnchorGeneratorPlugin::getPluginVersion() const { return "1"; } + +int AnchorGeneratorPlugin::getNbOutputs() const { return 2; } + +nvinfer1::Dims AnchorGeneratorPlugin::getOutputDimensions( + int index, const nvinfer1::Dims* inputs, int nb_input_dims) { + nvinfer1::Dims dims{}; + dims.nbDims = 4; + dims.d[0] = height_; + dims.d[1] = width_; + dims.d[2] = num_anchors_; + dims.d[3] = 4; + return dims; +} + +bool AnchorGeneratorPlugin::supportsFormat( + nvinfer1::DataType type, nvinfer1::TensorFormat format) const { + // static shape plugin can't support different type between input/out + // it may cause addition overhead in half mode + return (type == data_type_ && format == nvinfer1::TensorFormat::kLINEAR); +} + +size_t AnchorGeneratorPlugin::getWorkspaceSize(int max_batch_size) const { + return 0; +} + +template +int AnchorGeneratorPlugin::enqueue_impl(int batch_size, + const void* const* inputs, + void** outputs, void* workspace, + cudaStream_t stream) { + const int block = 512; + const int gen_anchor_grid = (box_num_ + block - 1) / block; + T* anchors = static_cast(outputs[0]); + T* vars = static_cast(outputs[1]); + const T* anchor_sizes_device = static_cast(anchor_sizes_device_); + const T* aspect_ratios_device = static_cast(aspect_ratios_device_); + const T* stride_device = static_cast(stride_device_); + const T* variances_device = static_cast(variances_device_); + paddle::operators::GenAnchors<<>>( + anchors, aspect_ratios_device, aspect_ratios_.size(), anchor_sizes_device, + anchor_sizes_.size(), stride_device, stride_.size(), height_, width_, + offset_); + const int var_grid = (box_num_ * 4 + block - 1) / block; + paddle::operators::SetVariance<<>>( + vars, variances_device, variances_.size(), box_num_ * 4); + return cudaGetLastError() != cudaSuccess; +} + +int AnchorGeneratorPlugin::enqueue(int batch_size, const void* const* inputs, + void** outputs, void* workspace, + cudaStream_t stream) { + return enqueue_impl(batch_size, inputs, outputs, workspace, stream); +} + +int AnchorGeneratorPlugin::initialize() { return 0; } + +void AnchorGeneratorPlugin::terminate() {} + +size_t AnchorGeneratorPlugin::getSerializationSize() const { + size_t serialize_size = 0; + serialize_size += SerializedSize(data_type_); + serialize_size += SerializedSize(anchor_sizes_); + serialize_size += SerializedSize(aspect_ratios_); + serialize_size += SerializedSize(stride_); + serialize_size += SerializedSize(variances_); + serialize_size += SerializedSize(offset_); + serialize_size += SerializedSize(height_); + serialize_size += SerializedSize(width_); + serialize_size += SerializedSize(num_anchors_); + serialize_size += SerializedSize(box_num_); + return serialize_size; +} + +void AnchorGeneratorPlugin::serialize(void* buffer) const { + SerializeValue(&buffer, data_type_); + SerializeValue(&buffer, anchor_sizes_); + SerializeValue(&buffer, aspect_ratios_); + SerializeValue(&buffer, stride_); + SerializeValue(&buffer, variances_); + SerializeValue(&buffer, offset_); + SerializeValue(&buffer, height_); + SerializeValue(&buffer, width_); + SerializeValue(&buffer, num_anchors_); + SerializeValue(&buffer, box_num_); +} + +void AnchorGeneratorPlugin::destroy() {} + +void AnchorGeneratorPlugin::setPluginNamespace(const char* lib_namespace) { + namespace_ = std::string(lib_namespace); +} + +const 
char* AnchorGeneratorPlugin::getPluginNamespace() const { + return namespace_.c_str(); +} + +nvinfer1::DataType AnchorGeneratorPlugin::getOutputDataType( + int index, const nvinfer1::DataType* input_type, int nb_inputs) const { + return data_type_; +} + +bool AnchorGeneratorPlugin::isOutputBroadcastAcrossBatch( + int output_index, const bool* input_is_broadcast, int nb_inputs) const { + return true; +} + +bool AnchorGeneratorPlugin::canBroadcastInputAcrossBatch( + int input_index) const { + return false; +} + +void AnchorGeneratorPlugin::configurePlugin( + const nvinfer1::Dims* input_dims, int nb_inputs, + const nvinfer1::Dims* output_dims, int nb_outputs, + const nvinfer1::DataType* input_types, + const nvinfer1::DataType* output_types, const bool* input_is_broadcast, + const bool* output_is_broadcast, nvinfer1::PluginFormat float_format, + int max_batct_size) {} + +nvinfer1::IPluginV2Ext* AnchorGeneratorPlugin::clone() const { + auto plugin = new AnchorGeneratorPlugin( + data_type_, anchor_sizes_, aspect_ratios_, stride_, variances_, offset_, + height_, width_, num_anchors_, box_num_); + plugin->setPluginNamespace(namespace_.c_str()); + return plugin; +} + +void AnchorGeneratorPluginCreator::setPluginNamespace( + const char* lib_namespace) { + namespace_ = std::string(lib_namespace); +} + +const char* AnchorGeneratorPluginCreator::getPluginNamespace() const { + return namespace_.c_str(); +} + +const char* AnchorGeneratorPluginCreator::getPluginName() const { + return "anchor_generator_plugin"; +} + +const char* AnchorGeneratorPluginCreator::getPluginVersion() const { + return "1"; +} + +const nvinfer1::PluginFieldCollection* +AnchorGeneratorPluginCreator::getFieldNames() { + return &field_collection_; +} + +nvinfer1::IPluginV2Ext* AnchorGeneratorPluginCreator::createPlugin( + const char* name, const nvinfer1::PluginFieldCollection* fc) { + const nvinfer1::PluginField* fields = fc->fields; + int type_id = -1; + std::vector anchor_sizes, aspect_ratios, stride, variances; + float offset = .5; + int height = -1, width = -1; + int num_anchors = -1; + int box_num = -1; + + for (int i = 0; i < fc->nbFields; ++i) { + const std::string field_name(fc->fields[i].name); + const auto length = fc->fields[i].length; + if (field_name.compare("type_id") == 0) { + type_id = *static_cast(fc->fields[i].data); + } else if (field_name.compare("anchor_sizes")) { + const auto* data = static_cast(fc->fields[i].data); + anchor_sizes.insert(anchor_sizes.end(), data, data + length); + } else if (field_name.compare("aspect_ratios")) { + const auto* data = static_cast(fc->fields[i].data); + aspect_ratios.insert(aspect_ratios.end(), data, data + length); + } else if (field_name.compare("stride")) { + const auto* data = static_cast(fc->fields[i].data); + stride.insert(stride.end(), data, data + length); + } else if (field_name.compare("variances")) { + const auto* data = static_cast(fc->fields[i].data); + variances.insert(variances.end(), data, data + length); + } else if (field_name.compare("offset")) { + offset = *static_cast(fc->fields[i].data); + } else if (field_name.compare("height")) { + height = *static_cast(fc->fields[i].data); + } else if (field_name.compare("width")) { + width = *static_cast(fc->fields[i].data); + } else if (field_name.compare("num_anchors")) { + num_anchors = *static_cast(fc->fields[i].data); + } else if (field_name.compare("box_num")) { + box_num = *static_cast(fc->fields[i].data); + } else { + assert(false && "unknown plugin field name."); + } + } + return new 
AnchorGeneratorPlugin(nvinfer1::DataType::kFLOAT, anchor_sizes, + aspect_ratios, stride, variances, offset, + height, width, num_anchors, box_num); +} + +nvinfer1::IPluginV2Ext* AnchorGeneratorPluginCreator::deserializePlugin( + const char* name, const void* serial_data, size_t serial_length) { + auto plugin = new AnchorGeneratorPlugin(serial_data, serial_length); + plugin->setPluginNamespace(namespace_.c_str()); + return plugin; +} + +#if IS_TRT_VERSION_GE(6000) +AnchorGeneratorPluginDynamic::AnchorGeneratorPluginDynamic( + const nvinfer1::DataType data_type, const std::vector& anchor_sizes, + const std::vector& aspect_ratios, const std::vector& stride, + const std::vector& variances, const float offset, + const int num_anchors) + : data_type_(data_type), + anchor_sizes_(anchor_sizes), + aspect_ratios_(aspect_ratios), + stride_(stride), + variances_(variances), + offset_(offset), + num_anchors_(num_anchors) { + // data_type_ is used to determine the output data type + // data_type_ can only be float32 + // height, width, num_anchors are calculated at configurePlugin + PADDLE_ENFORCE_EQ(data_type_, nvinfer1::DataType::kFLOAT, + platform::errors::InvalidArgument( + "TRT anchor generator plugin only accepts float32.")); + PADDLE_ENFORCE_GE( + num_anchors_, 0, + platform::errors::InvalidArgument( + "TRT anchor generator plugin only accepts number of anchors greater " + "than 0, but receive number of anchors = %d.", + num_anchors_)); + PrepareParamsOnDevice(); +} + +AnchorGeneratorPluginDynamic::~AnchorGeneratorPluginDynamic() { + auto release_device_ptr = [](void* ptr) { + if (ptr) { + cudaFree(ptr); + ptr = nullptr; + } + }; + release_device_ptr(anchor_sizes_device_); + release_device_ptr(aspect_ratios_device_); + release_device_ptr(stride_device_); + release_device_ptr(variances_device_); +} + +AnchorGeneratorPluginDynamic::AnchorGeneratorPluginDynamic(void const* data, + size_t length) { + DeserializeValue(&data, &length, &data_type_); + DeserializeValue(&data, &length, &anchor_sizes_); + DeserializeValue(&data, &length, &aspect_ratios_); + DeserializeValue(&data, &length, &stride_); + DeserializeValue(&data, &length, &variances_); + DeserializeValue(&data, &length, &offset_); + DeserializeValue(&data, &length, &num_anchors_); + PrepareParamsOnDevice(); +} + +nvinfer1::IPluginV2DynamicExt* AnchorGeneratorPluginDynamic::clone() const { + auto plugin = new AnchorGeneratorPluginDynamic( + data_type_, anchor_sizes_, aspect_ratios_, stride_, variances_, offset_, + num_anchors_); + plugin->setPluginNamespace(namespace_.c_str()); + return plugin; +} + +nvinfer1::DimsExprs AnchorGeneratorPluginDynamic::getOutputDimensions( + int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, + nvinfer1::IExprBuilder& exprBuilder) { + nvinfer1::DimsExprs ret{}; + ret.nbDims = 4; + ret.d[0] = inputs[0].d[2]; // feature height + ret.d[1] = inputs[0].d[3]; // feature width + ret.d[2] = exprBuilder.constant(num_anchors_); + ret.d[3] = exprBuilder.constant(4); + return ret; +} + +bool AnchorGeneratorPluginDynamic::supportsFormatCombination( + int pos, const nvinfer1::PluginTensorDesc* inOut, int nbInputs, + int nbOutputs) { + // input can be any, doesn't matter + // anchor generator doesn't read input raw data, only need the shape info + auto type = inOut[pos].type; + auto format = inOut[pos].format; +#if IS_TRT_VERSION_GE(7234) + if (pos == 0) return true; +#else + if (pos == 0) return format == nvinfer1::TensorFormat::kLINEAR; +#endif + return (type == nvinfer1::DataType::kFLOAT && + format == 
nvinfer1::TensorFormat::kLINEAR); +} + +void AnchorGeneratorPluginDynamic::configurePlugin( + const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* out, int nbOutputs) {} + +size_t AnchorGeneratorPluginDynamic::getWorkspaceSize( + const nvinfer1::PluginTensorDesc* inputs, int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, int nbOutputs) const { + return 0; +} + +template +int AnchorGeneratorPluginDynamic::enqueue_impl( + const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, const void* const* inputs, + void* const* outputs, void* workspace, cudaStream_t stream) { + const int height = inputDesc[0].dims.d[2]; + const int width = inputDesc[0].dims.d[3]; + const int box_num = height * width * num_anchors_; + const int block = 512; + const int gen_anchor_grid = (box_num + block - 1) / block; + T* anchors = static_cast(outputs[0]); + T* vars = static_cast(outputs[1]); + const T* anchor_sizes_device = static_cast(anchor_sizes_device_); + const T* aspect_ratios_device = static_cast(aspect_ratios_device_); + const T* stride_device = static_cast(stride_device_); + const T* variances_device = static_cast(variances_device_); + paddle::operators::GenAnchors<<>>( + anchors, aspect_ratios_device, aspect_ratios_.size(), anchor_sizes_device, + anchor_sizes_.size(), stride_device, stride_.size(), height, width, + offset_); + const int var_grid = (box_num * 4 + block - 1) / block; + paddle::operators::SetVariance<<>>( + vars, variances_device, variances_.size(), box_num * 4); + return cudaGetLastError() != cudaSuccess; +} + +int AnchorGeneratorPluginDynamic::enqueue( + const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, const void* const* inputs, + void* const* outputs, void* workspace, cudaStream_t stream) { + assert(outputDesc[0].type == nvinfer1::DataType::kFLOAT); + assert(outputDesc[1].type == nvinfer1::DataType::kFLOAT); + return enqueue_impl(inputDesc, outputDesc, inputs, outputs, workspace, + stream); +} + +nvinfer1::DataType AnchorGeneratorPluginDynamic::getOutputDataType( + int index, const nvinfer1::DataType* inputTypes, int nbInputs) const { + return data_type_; +} + +const char* AnchorGeneratorPluginDynamic::getPluginType() const { + return "anchor_generator_plugin_dynamic"; +} + +int AnchorGeneratorPluginDynamic::getNbOutputs() const { return 2; } + +int AnchorGeneratorPluginDynamic::initialize() { return 0; } + +void AnchorGeneratorPluginDynamic::terminate() {} + +size_t AnchorGeneratorPluginDynamic::getSerializationSize() const { + size_t serialize_size = 0; + serialize_size += SerializedSize(data_type_); + serialize_size += SerializedSize(anchor_sizes_); + serialize_size += SerializedSize(aspect_ratios_); + serialize_size += SerializedSize(stride_); + serialize_size += SerializedSize(variances_); + serialize_size += SerializedSize(offset_); + serialize_size += SerializedSize(num_anchors_); + return serialize_size; +} + +void AnchorGeneratorPluginDynamic::serialize(void* buffer) const { + SerializeValue(&buffer, data_type_); + SerializeValue(&buffer, anchor_sizes_); + SerializeValue(&buffer, aspect_ratios_); + SerializeValue(&buffer, stride_); + SerializeValue(&buffer, variances_); + SerializeValue(&buffer, offset_); + SerializeValue(&buffer, num_anchors_); +} + +void AnchorGeneratorPluginDynamic::destroy() {} + +void AnchorGeneratorPluginDynamicCreator::setPluginNamespace( + const char* lib_namespace) { + namespace_ = std::string(lib_namespace); +} + +const 
char* AnchorGeneratorPluginDynamicCreator::getPluginNamespace() const { + return namespace_.c_str(); +} + +const char* AnchorGeneratorPluginDynamicCreator::getPluginName() const { + return "anchor_generator_plugin_dynamic"; +} + +const char* AnchorGeneratorPluginDynamicCreator::getPluginVersion() const { + return "1"; +} + +const nvinfer1::PluginFieldCollection* +AnchorGeneratorPluginDynamicCreator::getFieldNames() { + return &field_collection_; +} + +nvinfer1::IPluginV2Ext* AnchorGeneratorPluginDynamicCreator::createPlugin( + const char* name, const nvinfer1::PluginFieldCollection* fc) { + const nvinfer1::PluginField* fields = fc->fields; + int type_id = -1; + std::vector anchor_sizes, aspect_ratios, stride, variances; + float offset = .5; + int num_anchors = -1; + for (int i = 0; i < fc->nbFields; ++i) { + const std::string field_name(fc->fields[i].name); + const auto length = fc->fields[i].length; + if (field_name.compare("type_id") == 0) { + type_id = *static_cast(fc->fields[i].data); + } else if (field_name.compare("anchor_sizes")) { + const auto* data = static_cast(fc->fields[i].data); + anchor_sizes.insert(anchor_sizes.end(), data, data + length); + } else if (field_name.compare("aspect_ratios")) { + const auto* data = static_cast(fc->fields[i].data); + aspect_ratios.insert(aspect_ratios.end(), data, data + length); + } else if (field_name.compare("stride")) { + const auto* data = static_cast(fc->fields[i].data); + stride.insert(stride.end(), data, data + length); + } else if (field_name.compare("variances")) { + const auto* data = static_cast(fc->fields[i].data); + variances.insert(variances.end(), data, data + length); + } else if (field_name.compare("offset")) { + offset = *static_cast(fc->fields[i].data); + } else if (field_name.compare("num_anchors")) { + num_anchors = *static_cast(fc->fields[i].data); + } else { + assert(false && "unknown plugin field name."); + } + } + return new AnchorGeneratorPluginDynamic(nvinfer1::DataType::kFLOAT, + anchor_sizes, aspect_ratios, stride, + variances, offset, num_anchors); +} + +nvinfer1::IPluginV2Ext* AnchorGeneratorPluginDynamicCreator::deserializePlugin( + const char* name, const void* serial_data, size_t serial_length) { + auto plugin = new AnchorGeneratorPluginDynamic(serial_data, serial_length); + plugin->setPluginNamespace(namespace_.c_str()); + return plugin; +} +#endif + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.h new file mode 100644 index 0000000000000000000000000000000000000000..aff0b6a6802f114a25acf32627a39ca42d572d7c --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.h @@ -0,0 +1,201 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
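The implementation above provides two variants of the anchor generator plugin: the IPluginV2Ext version bakes the feature-map height/width and the resulting box_num into the plugin at build time, while the IPluginV2DynamicExt version reads them from the input descriptor in enqueue. A minimal sketch of how a converter would construct each, assuming the attribute vectors (std::vector<float>) and the feature-map size feat_h/feat_w have already been read from the op:

// static-shape engine: spatial dims are known when the network is built
auto* plugin = new plugin::AnchorGeneratorPlugin(
    nvinfer1::DataType::kFLOAT, anchor_sizes, aspect_ratios, stride, variances,
    offset, feat_h, feat_w, num_anchors,
    /*box_num=*/feat_h * feat_w * num_anchors);

// dynamic-shape engine: height/width come from inputDesc at enqueue time
auto* dyn_plugin = new plugin::AnchorGeneratorPluginDynamic(
    nvinfer1::DataType::kFLOAT, anchor_sizes, aspect_ratios, stride, variances,
    offset, num_anchors);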
+ +#pragma once + +#include +#include + +#include "paddle/fluid/inference/tensorrt/engine.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +class AnchorGeneratorPlugin : public nvinfer1::IPluginV2Ext { + public: + explicit AnchorGeneratorPlugin( + const nvinfer1::DataType, const std::vector& anchor_sizes, + const std::vector& aspect_ratios, const std::vector& stride, + const std::vector& variances, const float offset, const int height, + const int width, const int num_anchors, const int box_num); + AnchorGeneratorPlugin(const void* data, size_t length); + ~AnchorGeneratorPlugin() override; + const char* getPluginType() const override; + const char* getPluginVersion() const override; + int getNbOutputs() const override; + nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, + int nb_input_dims) override; + bool supportsFormat(nvinfer1::DataType type, + nvinfer1::TensorFormat format) const override; + size_t getWorkspaceSize(int max_batch_size) const override; + int enqueue(int batch_size, const void* const* inputs, void** outputs, + void* workspace, cudaStream_t stream) override; + int initialize() override; + void terminate() override; + size_t getSerializationSize() const override; + void serialize(void* buffer) const override; + void destroy() override; + void setPluginNamespace(const char* lib_namespace) override; + const char* getPluginNamespace() const override; + nvinfer1::DataType getOutputDataType(int index, + const nvinfer1::DataType* input_type, + int nb_inputs) const override; + bool isOutputBroadcastAcrossBatch(int output_index, + const bool* input_is_broadcast, + int nb_inputs) const override; + bool canBroadcastInputAcrossBatch(int input_index) const override; + void configurePlugin(const nvinfer1::Dims* input_dims, int nb_inputs, + const nvinfer1::Dims* output_dims, int nb_outputs, + const nvinfer1::DataType* input_types, + const nvinfer1::DataType* output_types, + const bool* input_is_broadcast, + const bool* output_is_broadcast, + nvinfer1::PluginFormat float_format, + int max_batct_size) override; + nvinfer1::IPluginV2Ext* clone() const override; + + private: + template + int enqueue_impl(int batch_size, const void* const* inputs, void** outputs, + void* workspace, cudaStream_t stream); + nvinfer1::DataType data_type_; + std::vector anchor_sizes_; + std::vector aspect_ratios_; + std::vector stride_; + std::vector variances_; + float offset_; + void* anchor_sizes_device_; + void* aspect_ratios_device_; + void* stride_device_; + void* variances_device_; + int height_; + int width_; + int num_anchors_; + int box_num_; + std::string namespace_; +}; + +class AnchorGeneratorPluginCreator : public nvinfer1::IPluginCreator { + public: + AnchorGeneratorPluginCreator() = default; + ~AnchorGeneratorPluginCreator() override = default; + void setPluginNamespace(const char* lib_namespace) override; + const char* getPluginNamespace() const override; + const char* getPluginName() const override; + const char* getPluginVersion() const override; + const nvinfer1::PluginFieldCollection* getFieldNames() override; + nvinfer1::IPluginV2Ext* createPlugin( + const char* name, const nvinfer1::PluginFieldCollection* fc) override; + nvinfer1::IPluginV2Ext* deserializePlugin(const char* name, + const void* serial_data, + size_t serial_length) override; + + private: + std::string namespace_; + nvinfer1::PluginFieldCollection field_collection_; +}; + 
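REGISTER_TRT_PLUGIN_V2 below puts AnchorGeneratorPluginCreator into the global TensorRT plugin registry, and the creator's getPluginName()/getPluginVersion() must match the plugin's getPluginType()/getPluginVersion() ("anchor_generator_plugin", "1") so a serialized engine can find it again. A minimal sketch of the lookup TensorRT performs when deserializing an engine (standard IPluginRegistry API; the actual call happens inside TensorRT, not in this patch):

auto* creator =
    getPluginRegistry()->getPluginCreator("anchor_generator_plugin", "1");
// TensorRT then calls creator->deserializePlugin(layer_name, serial_data,
// serial_length), which rebuilds the plugin from the fields written in
// serialize() via the (const void*, size_t) constructor above.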
+REGISTER_TRT_PLUGIN_V2(AnchorGeneratorPluginCreator); + +#if IS_TRT_VERSION_GE(6000) +class AnchorGeneratorPluginDynamic : public DynamicPluginTensorRT { + public: + explicit AnchorGeneratorPluginDynamic(const nvinfer1::DataType data_type, + const std::vector& anchor_sizes, + const std::vector& aspect_ratios, + const std::vector& stride, + const std::vector& variances, + const float offset, + const int num_anchors); + AnchorGeneratorPluginDynamic(void const* data, size_t length); + ~AnchorGeneratorPluginDynamic(); + nvinfer1::IPluginV2DynamicExt* clone() const override; + nvinfer1::DimsExprs getOutputDimensions( + int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, + nvinfer1::IExprBuilder& exprBuilder) override; + bool supportsFormatCombination(int pos, + const nvinfer1::PluginTensorDesc* inOut, + int nbInputs, int nbOutputs) override; + void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, + int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* out, + int nbOutputs) override; + size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, + int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, + int nbOutputs) const override; + int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, void* const* outputs, void* workspace, + cudaStream_t stream) override; + nvinfer1::DataType getOutputDataType(int index, + const nvinfer1::DataType* inputTypes, + int nbInputs) const override; + const char* getPluginType() const override; + int getNbOutputs() const override; + int initialize() override; + void terminate() override; + size_t getSerializationSize() const override; + void serialize(void* buffer) const override; + void destroy() override; + + private: + template + int enqueue_impl(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, void* const* outputs, + void* workspace, cudaStream_t stream); + nvinfer1::DataType data_type_; + std::vector anchor_sizes_; + std::vector aspect_ratios_; + std::vector stride_; + std::vector variances_; + float offset_; + void* anchor_sizes_device_; + void* aspect_ratios_device_; + void* stride_device_; + void* variances_device_; + int num_anchors_; + std::string namespace_; +}; + +class AnchorGeneratorPluginDynamicCreator : public nvinfer1::IPluginCreator { + public: + AnchorGeneratorPluginDynamicCreator() = default; + ~AnchorGeneratorPluginDynamicCreator() override = default; + void setPluginNamespace(const char* lib_namespace) override; + const char* getPluginNamespace() const override; + const char* getPluginName() const override; + const char* getPluginVersion() const override; + const nvinfer1::PluginFieldCollection* getFieldNames() override; + nvinfer1::IPluginV2Ext* createPlugin( + const char* name, const nvinfer1::PluginFieldCollection* fc) override; + nvinfer1::IPluginV2Ext* deserializePlugin(const char* name, + const void* serial_data, + size_t serial_length) override; + + private: + std::string namespace_; + nvinfer1::PluginFieldCollection field_collection_; +}; +REGISTER_TRT_PLUGIN_V2(AnchorGeneratorPluginDynamicCreator); +#endif + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu index 457d9dd87375477926480bce0a84e8f89c409698..cc17f8aa2481708e3e19c9925a1d83ad06203145 100644 --- 
a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu @@ -152,9 +152,14 @@ int ElementWisePlugin::enqueue(int batch_size, const void *const *inputs, int ElementwisePluginDynamic::initialize() { return 0; } -size_t ElementwisePluginDynamic::getSerializationSize() const { return 0; } +size_t ElementwisePluginDynamic::getSerializationSize() const { + return SerializedSize(type_.c_str()) + SerializedSize(axis_); +} -void ElementwisePluginDynamic::serialize(void *buffer) const {} +void ElementwisePluginDynamic::serialize(void *buffer) const { + SerializeValue(&buffer, type_.c_str()); + SerializeValue(&buffer, axis_); +} nvinfer1::DimsExprs ElementwisePluginDynamic::getOutputDimensions( int output_index, const nvinfer1::DimsExprs *inputs, int nb_inputs, diff --git a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h index e37511868d88f600a733df4ebb478e74a385be1b..75a1dd85f0f2c440fdd16beb95144df4127739e6 100644 --- a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h @@ -92,7 +92,12 @@ class ElementwisePluginDynamic : public DynamicPluginTensorRT { public: explicit ElementwisePluginDynamic(const std::string& type, int axis) : type_(type), axis_(axis) {} - ElementwisePluginDynamic(void const* serialData, size_t serialLength) {} + ElementwisePluginDynamic(void const* serialData, size_t serialLength) { + const char* elementwise_type; + DeserializeValue(&serialData, &serialLength, &elementwise_type); + type_ = std::string(elementwise_type); + DeserializeValue(&serialData, &serialLength, &axis_); + } nvinfer1::IPluginV2DynamicExt* clone() const override { return new ElementwisePluginDynamic(type_, axis_); } @@ -138,6 +143,46 @@ class ElementwisePluginDynamic : public DynamicPluginTensorRT { std::string type_; int axis_; }; + +class ElementwisePluginDynamicCreator : public nvinfer1::IPluginCreator { + public: + ElementwisePluginDynamicCreator() {} + const char* getPluginName() const override { return "elementwise_plugin"; } + + const char* getPluginVersion() const override { return "1"; } + + const nvinfer1::PluginFieldCollection* getFieldNames() override { + return &field_collection_; + } + + nvinfer1::IPluginV2* createPlugin( + const char* name, const nvinfer1::PluginFieldCollection* fc) override { + return nullptr; + } + + nvinfer1::IPluginV2* deserializePlugin(const char* name, + const void* serial_data, + size_t serial_length) override { + auto plugin = new ElementwisePluginDynamic(serial_data, serial_length); + return plugin; + } + + void setPluginNamespace(const char* lib_namespace) override { + plugin_namespace_ = lib_namespace; + } + + const char* getPluginNamespace() const override { + return plugin_namespace_.c_str(); + } + + private: + std::string plugin_namespace_; + std::string plugin_name_; + nvinfer1::PluginFieldCollection field_collection_{0, nullptr}; + std::vector plugin_attributes_; +}; + +REGISTER_TRT_PLUGIN_V2(ElementwisePluginDynamicCreator); #endif } // namespace plugin diff --git a/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu index 238daa4a886a48036cdcd29a1173509b791254fd..6d3872aaeb8a77acf1455e4d5e555ee01d36478a 100644 --- a/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu +++ 
b/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu @@ -200,12 +200,10 @@ nvinfer1::DimsExprs EmbEltwiseLayernormPluginDynamic::getOutputDimensions( "but it's (%d)", output_index)); nvinfer1::DimsExprs ret; - ret.nbDims = 5; + ret.nbDims = 3; ret.d[0] = inputs[0].d[0]; ret.d[1] = inputs[0].d[1]; ret.d[2] = expr_builder.constant(hidden_size_); - ret.d[3] = expr_builder.constant(1); - ret.d[4] = expr_builder.constant(1); return ret; } diff --git a/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h b/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h index 6c8381a750cba96796dd063dd54de779b5933a9f..7de84a8fc49bcc4cf94e8d406ab0362cbacfb175 100644 --- a/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h @@ -306,9 +306,10 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT { } }; -class EmbEltwiseLayernormPluginV2Creator : public nvinfer1::IPluginCreator { +class EmbEltwiseLayernormPluginDynamicCreator + : public nvinfer1::IPluginCreator { public: - EmbEltwiseLayernormPluginV2Creator() {} + EmbEltwiseLayernormPluginDynamicCreator() {} const char* getPluginName() const override { return "fused_embedding_eltwise_layernorm_plugin"; } @@ -345,7 +346,7 @@ class EmbEltwiseLayernormPluginV2Creator : public nvinfer1::IPluginCreator { std::vector plugin_attributes_; }; -REGISTER_TRT_PLUGIN_V2(EmbEltwiseLayernormPluginV2Creator); +REGISTER_TRT_PLUGIN_V2(EmbEltwiseLayernormPluginDynamicCreator); #endif } // namespace plugin diff --git a/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.h index 979f600a3a9cea0ab5bc35fc0c2882cf34c82c98..23e507ee477e1a3b85339c7b267b290de19805ab 100644 --- a/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.h @@ -115,9 +115,9 @@ class GeluPluginDynamic : public DynamicPluginTensorRT { void destroy() override { delete this; } }; -class GeluPluginV2Creator : public nvinfer1::IPluginCreator { +class GeluPluginDynamicCreator : public nvinfer1::IPluginCreator { public: - GeluPluginV2Creator() {} + GeluPluginDynamicCreator() {} const char* getPluginName() const override { return "gelu_plugin"; } const char* getPluginVersion() const override { return "1"; } @@ -153,7 +153,7 @@ class GeluPluginV2Creator : public nvinfer1::IPluginCreator { std::vector plugin_attributes_; }; -REGISTER_TRT_PLUGIN_V2(GeluPluginV2Creator); +REGISTER_TRT_PLUGIN_V2(GeluPluginDynamicCreator); #endif } // namespace plugin diff --git a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu index 1e7c83f4c60fb99964bf583087e7fd1f8c32d704..214e1a81e7dc04161a07f4c0bec643bf65b6c9f0 100644 --- a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu @@ -169,12 +169,10 @@ nvinfer1::DimsExprs QkvToContextPluginDynamic::getOutputDimensions( "it has (%d) inputs", nb_inputs)); nvinfer1::DimsExprs ret; - ret.nbDims = 5; + ret.nbDims = 3; ret.d[0] = inputs[0].d[0]; ret.d[1] = inputs[0].d[1]; ret.d[2] = expr_builder.constant(head_size_ * head_number_); - ret.d[3] = expr_builder.constant(1); - ret.d[4] = expr_builder.constant(1); return ret; } @@ -227,6 +225,14 @@ nvinfer1::DataType QkvToContextPluginDynamic::getOutputDataType( return input_types[0]; } +template +__global__ void 
apply_scale(T *data, T scale, int n) { +#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) + int tid = blockIdx.x * blockDim.x + threadIdx.x; + data[tid] = data[tid] * scale; +#endif +} + int QkvToContextPluginDynamic::enqueue( const nvinfer1::PluginTensorDesc *input_desc, const nvinfer1::PluginTensorDesc *output_desc, const void *const *inputs, @@ -293,10 +299,17 @@ int QkvToContextPluginDynamic::enqueue( platform::DeviceContextPool::Instance().Get( platform::CUDAPlace(device_id))); + int n_q = seq_len * head_number_ * head_size_; + constexpr int threads = 128; + int blocks = (n_q + threads - 1) / threads; + + apply_scale<<>>(tptr, static_cast(scale_), + n_q); + const platform::CUDADeviceContext &dev_ctx = *device_ctx; operators::math::MultiHeadGPUComputeFunctor multihead_compute_func; multihead_compute_func(dev_ctx, batch, seq_len, head_number_, head_size_, - qkptr, input1_data, tptr, half(scale_), half(0.0)); + qkptr, input1_data, tptr, half(1.), half(0.0)); int grid = batch * head_number_ * seq_len; int block = head_size_; diff --git a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.h b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.h index b852f5a454c07ca9684f7bb12aa62275c3121de3..7147d9855755bec0fff814c32edf391269b6fe03 100644 --- a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.h @@ -118,9 +118,9 @@ class QkvToContextPluginDynamic : public DynamicPluginTensorRT { float scale_; }; -class QkvToContextPluginV2Creator : public nvinfer1::IPluginCreator { +class QkvToContextPluginDynamicCreator : public nvinfer1::IPluginCreator { public: - QkvToContextPluginV2Creator() {} + QkvToContextPluginDynamicCreator() {} const char* getPluginName() const override { return "qkv_to_context_plugin"; } const char* getPluginVersion() const override { return "1"; } @@ -155,7 +155,7 @@ class QkvToContextPluginV2Creator : public nvinfer1::IPluginCreator { nvinfer1::PluginFieldCollection field_collection_; std::vector plugin_attributes_; }; -REGISTER_TRT_PLUGIN_V2(QkvToContextPluginV2Creator); +REGISTER_TRT_PLUGIN_V2(QkvToContextPluginDynamicCreator); #endif } // namespace plugin diff --git a/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.cu new file mode 100644 index 0000000000000000000000000000000000000000..6e7ed0054f502ea014d3648ac0be22c167987735 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.cu @@ -0,0 +1,381 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
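In the qkv_to_context hunks above, the query-key scaling is moved out of MultiHeadGPUComputeFunctor (which now receives half(1.)) and applied beforehand by a small element-wise kernel over n_q = seq_len * head_number_ * head_size_ values. A reconstructed sketch of that kernel and its launch, with the template header and launch configuration restored (the tid bound check is an extra safety guard, not part of the hunk):

// Scale each element of the Q tensor before the batched attention GEMMs.
template <typename T>
__global__ void apply_scale(T *data, T scale, int n) {
#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__)
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid < n) data[tid] = data[tid] * scale;
#endif
}

// Launch side inside QkvToContextPluginDynamic::enqueue (fp16 path), as used above:
//   int n_q = seq_len * head_number_ * head_size_;
//   constexpr int threads = 128;
//   int blocks = (n_q + threads - 1) / threads;
//   apply_scale<<<blocks, threads, 0, stream>>>(tptr, static_cast<half>(scale_), n_q);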
+ +#include +#include +#include + +#include "paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +template +__inline__ __device__ T BilinearInterpolate(const T* input_data, + const int height, const int width, + T y, T x) { + if (y < -1.f || y > height || x < -1.f || x > width) return 0; + y = y <= 0.f ? 0.f : y; + x = x <= 0.f ? 0.f : x; + int y_low = static_cast(y); + int x_low = static_cast(x); + int y_high; + int x_high; + if (y_low >= height - 1) { + y_high = y_low = height - 1; + y = static_cast(y_low); + } else { + y_high = y_low + 1; + } + if (x_low >= width - 1) { + x_high = x_low = width - 1; + x = static_cast(x_low); + } else { + x_high = x_low + 1; + } + T ly = y - y_low, lx = x - x_low; + T hy = 1.f - ly, hx = 1.f - lx; + T v1 = input_data[y_low * width + x_low]; + T v2 = input_data[y_low * width + x_high]; + T v3 = input_data[y_high * width + x_low]; + T v4 = input_data[y_high * width + x_high]; + T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; + T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + return val; +} + +template +__global__ void GPUROIAlignOpt(const int nthreads, + const T* __restrict__ input_data, + const T* __restrict__ input_rois, + const float spatial_scale, const int channels, + const int height, const int width, + const int pooled_height, const int pooled_width, + const int sampling_ratio, const int num_rois, + OutT* __restrict__ output_data) { + const int batch = blockIdx.x; + const int channel = blockIdx.y; + const T* offset_input_data = + input_data + (batch * channels + channel) * height * width; + extern __shared__ T s_input_data[]; + if (USE_SMEM) { + for (int idx = threadIdx.x; idx < height * width; idx += blockDim.x) { + s_input_data[idx] = offset_input_data[idx]; + } + __syncthreads(); + } + for (int idx = threadIdx.x; idx < num_rois * pooled_height * pooled_width; + idx += blockDim.x) { + const int pw = idx % pooled_width; + const int ph = (idx / pooled_width) % pooled_height; + const int roi_idx = (idx / pooled_width / pooled_height) % num_rois; + const int n = batch * num_rois + roi_idx; + const float4 rois_offset = reinterpret_cast(input_rois)[n]; + const T roi_xmin = rois_offset.x * spatial_scale; + const T roi_ymin = rois_offset.y * spatial_scale; + const T roi_xmax = rois_offset.z * spatial_scale; + const T roi_ymax = rois_offset.w * spatial_scale; + const T roi_width = max(roi_xmax - roi_xmin, static_cast(1.f)); + const T roi_height = max(roi_ymax - roi_ymin, static_cast(1.f)); + const T bin_size_h = roi_height / static_cast(pooled_height); + const T bin_size_w = roi_width / static_cast(pooled_width); + const int roi_bin_grid_h = (sampling_ratio > 0) + ? sampling_ratio + : ceil(roi_height / pooled_height); + const int roi_bin_grid_w = + (sampling_ratio > 0) ? 
sampling_ratio : ceil(roi_width / pooled_width); + const T count = roi_bin_grid_h * roi_bin_grid_w; + + T output_val = 0.f; + for (int iy = 0; iy < roi_bin_grid_h; ++iy) { + const T y = roi_ymin + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); + for (int ix = 0; ix < roi_bin_grid_w; ++ix) { + const T x = roi_xmin + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + if (USE_SMEM) { + T val = BilinearInterpolate(s_input_data, height, width, y, x); + output_val += val; + } else { + T val = + BilinearInterpolate(offset_input_data, height, width, y, x); + output_val += val; + } + } + } + output_val /= count; + const int out_offset = + batch * num_rois * channels * pooled_height * pooled_width + + roi_idx * channels * pooled_height * pooled_width + + channel * pooled_height * pooled_width + ph * pooled_width + pw; + output_data[out_offset] = static_cast(output_val); + } +} + +#if IS_TRT_VERSION_GE(6000) +RoiAlignPluginDynamic::RoiAlignPluginDynamic(const nvinfer1::DataType data_type, + const int pooled_height, + const int pooled_width, + float spatial_scale, + int sampling_ratio) + : data_type_(data_type), + pooled_height_(pooled_height), + pooled_width_(pooled_width), + spatial_scale_(spatial_scale), + sampling_ratio_(sampling_ratio) { + bool data_type_is_valid = data_type_ == nvinfer1::DataType::kFLOAT || + data_type_ == nvinfer1::DataType::kHALF; + PADDLE_ENFORCE_EQ(data_type_is_valid, true, + platform::errors::InvalidArgument( + "TRT RoiAlign plugin only accepts kFLOAT(%d) or " + "kHALF(%d) data type, but the received data type = %d", + static_cast(nvinfer1::DataType::kFLOAT), + static_cast(nvinfer1::DataType::kHALF), + static_cast(data_type_))); + + PADDLE_ENFORCE_GT(pooled_height_, 0, + platform::errors::InvalidArgument( + "TRT RoiAlign plugin only accepts pooled_height " + "greater than %d, but the received pooled_height = %d", + 0, pooled_height_)); + + PADDLE_ENFORCE_GT(pooled_width_, 0, + platform::errors::InvalidArgument( + "TRT RoiAlign plugin only accepts pooled_width greater " + "than %d, but the received pooled_width = %d", + 0, pooled_height_)); + + PADDLE_ENFORCE_GT(spatial_scale_, 0.f, + platform::errors::InvalidArgument( + "TRT RoiAlign plugin only accepts spatial_scale " + "greater than %f, but the received spatial_scale = %f", + 0, spatial_scale_)); + + int smem_per_block = -1; + int device = -1; + cudaGetDevice(&device); + + PADDLE_ENFORCE_GE( + device, 0, + platform::errors::InvalidArgument( + "The cuda device ID should be greater than %d, but device ID is %d", + 0, device)); + + cudaDeviceGetAttribute(&smem_per_block, cudaDevAttrMaxSharedMemoryPerBlock, + device); + smem_per_block_ = smem_per_block; +} + +RoiAlignPluginDynamic::RoiAlignPluginDynamic(void const* data, size_t length) { + DeserializeValue(&data, &length, &data_type_); + DeserializeValue(&data, &length, &pooled_height_); + DeserializeValue(&data, &length, &pooled_width_); + DeserializeValue(&data, &length, &spatial_scale_); + DeserializeValue(&data, &length, &sampling_ratio_); + int smem_per_block = -1; + int device = -1; + cudaGetDevice(&device); + PADDLE_ENFORCE_GE( + device, 0, + platform::errors::InvalidArgument( + "The cuda device ID should be greater than %d, but device ID is %d", + 0, device)); + cudaDeviceGetAttribute(&smem_per_block, cudaDevAttrMaxSharedMemoryPerBlock, + device); + smem_per_block_ = smem_per_block; +} + +nvinfer1::IPluginV2DynamicExt* RoiAlignPluginDynamic::clone() const { + auto* plugin = + new 
RoiAlignPluginDynamic(data_type_, pooled_height_, pooled_width_, + spatial_scale_, sampling_ratio_); + plugin->setPluginNamespace(namespace_.c_str()); + return plugin; +} + +nvinfer1::DimsExprs RoiAlignPluginDynamic::getOutputDimensions( + int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, + nvinfer1::IExprBuilder& exprBuilder) { + nvinfer1::DimsExprs ret{}; + ret.nbDims = 4; + ret.d[0] = inputs[1].d[0]; // roi + ret.d[1] = inputs[0].d[1]; // X + ret.d[2] = exprBuilder.constant(pooled_height_); + ret.d[3] = exprBuilder.constant(pooled_width_); + return ret; +} + +bool RoiAlignPluginDynamic::supportsFormatCombination( + int pos, const nvinfer1::PluginTensorDesc* inOut, int nbInputs, + int nbOutputs) { + if (inOut[pos].format != nvinfer1::TensorFormat::kLINEAR) { + return false; + } + if (pos < 2) { // input + return inOut[pos].type == nvinfer1::DataType::kFLOAT; + } + return inOut[pos].type == data_type_; +} + +void RoiAlignPluginDynamic::configurePlugin( + const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* out, int nbOutputs) {} + +size_t RoiAlignPluginDynamic::getWorkspaceSize( + const nvinfer1::PluginTensorDesc* inputs, int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, int nbOutputs) const { + return 0; +} + +template +int RoiAlignPluginDynamic::enqueue_impl( + const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, const void* const* inputs, + void* const* outputs, void* workspace, cudaStream_t stream) { + auto in_dims = inputDesc[0].dims; + auto rois_dims = inputDesc[1].dims; + auto out_dims = outputDesc[0].dims; + + int rois_num = rois_dims.d[0]; + if (rois_num == 0) return cudaGetLastError() != cudaSuccess; + + int batch = in_dims.d[0]; + int channels = in_dims.d[1]; + int height = in_dims.d[2]; + int width = in_dims.d[3]; + + int output_size = + out_dims.d[0] * out_dims.d[1] * out_dims.d[2] * out_dims.d[3]; + + const dim3 blocks(batch, channels); + const int threads = 512; + + if (smem_per_block_ < width * height * sizeof(T)) { + GPUROIAlignOpt<<>>( + output_size, static_cast(inputs[0]), + static_cast(inputs[1]), spatial_scale_, channels, height, + width, pooled_height_, pooled_width_, sampling_ratio_, rois_num / batch, + static_cast(outputs[0])); + } else { + GPUROIAlignOpt< + T, OutT, true><<>>( + output_size, static_cast(inputs[0]), + static_cast(inputs[1]), spatial_scale_, channels, height, + width, pooled_height_, pooled_width_, sampling_ratio_, rois_num / batch, + static_cast(outputs[0])); + } + + return cudaGetLastError() != cudaSuccess; +} + +int RoiAlignPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, + void* const* outputs, void* workspace, + cudaStream_t stream) { + PADDLE_ENFORCE_EQ(outputDesc[0].type, data_type_, + platform::errors::InvalidArgument( + "TRT RoiAlignPluginDynamic expects outputDesc[0].type " + "equal to data_type_")); + + if (data_type_ == nvinfer1::DataType::kHALF) { + return enqueue_impl(inputDesc, outputDesc, inputs, outputs, + workspace, stream); + } + return enqueue_impl(inputDesc, outputDesc, inputs, outputs, + workspace, stream); +} + +nvinfer1::DataType RoiAlignPluginDynamic::getOutputDataType( + int index, const nvinfer1::DataType* inputTypes, int nbInputs) const { + return data_type_; +} + +const char* RoiAlignPluginDynamic::getPluginType() const { + return "roi_align_plugin_dynamic"; +} + +int RoiAlignPluginDynamic::getNbOutputs() const { 
return 1; } + +int RoiAlignPluginDynamic::initialize() { return 0; } + +void RoiAlignPluginDynamic::terminate() {} + +size_t RoiAlignPluginDynamic::getSerializationSize() const { + size_t serialize_size = 0; + serialize_size += SerializedSize(data_type_); + serialize_size += SerializedSize(pooled_height_); + serialize_size += SerializedSize(pooled_width_); + serialize_size += SerializedSize(spatial_scale_); + serialize_size += SerializedSize(sampling_ratio_); + return serialize_size; +} + +void RoiAlignPluginDynamic::serialize(void* buffer) const { + SerializeValue(&buffer, data_type_); + SerializeValue(&buffer, pooled_height_); + SerializeValue(&buffer, pooled_width_); + SerializeValue(&buffer, spatial_scale_); + SerializeValue(&buffer, sampling_ratio_); +} + +void RoiAlignPluginDynamic::destroy() {} + +RoiAlignPluginDynamicCreator::RoiAlignPluginDynamicCreator() {} + +void RoiAlignPluginDynamicCreator::setPluginNamespace( + const char* lib_namespace) { + namespace_ = std::string(lib_namespace); +} + +const char* RoiAlignPluginDynamicCreator::getPluginNamespace() const { + return namespace_.c_str(); +} + +const char* RoiAlignPluginDynamicCreator::getPluginName() const { + return "roi_align_plugin_dynamic"; +} + +const char* RoiAlignPluginDynamicCreator::getPluginVersion() const { + return "1"; +} + +const nvinfer1::PluginFieldCollection* +RoiAlignPluginDynamicCreator::getFieldNames() { + return &field_collection_; +} + +nvinfer1::IPluginV2Ext* RoiAlignPluginDynamicCreator::createPlugin( + const char* name, const nvinfer1::PluginFieldCollection* fc) { + const nvinfer1::PluginField* fields = fc->fields; + return nullptr; +} + +nvinfer1::IPluginV2Ext* RoiAlignPluginDynamicCreator::deserializePlugin( + const char* name, const void* serial_data, size_t serial_length) { + auto plugin = new RoiAlignPluginDynamic(serial_data, serial_length); + plugin->setPluginNamespace(namespace_.c_str()); + return plugin; +} +#endif + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.h new file mode 100644 index 0000000000000000000000000000000000000000..bba7d0d5a996641495fba5b8f406bdf37148babe --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.h @@ -0,0 +1,112 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
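RoiAlignPluginDynamic::enqueue_impl above launches GPUROIAlignOpt with one block per (batch, channel) pair and stages a whole H x W feature-map slice in shared memory only when it fits the per-block budget queried via cudaDevAttrMaxSharedMemoryPerBlock at construction time. A condensed sketch of that dispatch with the template arguments and launch configuration restored:

// One block per (batch, channel); USE_SMEM selects the shared-memory variant.
const dim3 blocks(batch, channels);
const int threads = 512;

if (smem_per_block_ < width * height * sizeof(T)) {
  // Slice too large for shared memory: the kernel reads global memory directly.
  GPUROIAlignOpt<T, OutT, false><<<blocks, threads, 0, stream>>>(
      output_size, static_cast<const T*>(inputs[0]),
      static_cast<const T*>(inputs[1]), spatial_scale_, channels, height,
      width, pooled_height_, pooled_width_, sampling_ratio_, rois_num / batch,
      static_cast<OutT*>(outputs[0]));
} else {
  // Dynamic shared memory sized to hold one H*W slice of the input feature map.
  GPUROIAlignOpt<T, OutT, true>
      <<<blocks, threads, width * height * sizeof(T), stream>>>(
          output_size, static_cast<const T*>(inputs[0]),
          static_cast<const T*>(inputs[1]), spatial_scale_, channels, height,
          width, pooled_height_, pooled_width_, sampling_ratio_,
          rois_num / batch, static_cast<OutT*>(outputs[0]));
}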
+ +#pragma once + +#include +#include + +#include "paddle/fluid/inference/tensorrt/engine.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +#if IS_TRT_VERSION_GE(6000) +class RoiAlignPluginDynamic : public DynamicPluginTensorRT { + public: + explicit RoiAlignPluginDynamic(const nvinfer1::DataType data_type, + const int pooled_height, + const int pooled_width, float spatial_scale, + int sampling_ratio); + RoiAlignPluginDynamic(void const* data, size_t length); + ~RoiAlignPluginDynamic() = default; + nvinfer1::IPluginV2DynamicExt* clone() const override; + nvinfer1::DimsExprs getOutputDimensions( + int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, + nvinfer1::IExprBuilder& exprBuilder) override; + bool supportsFormatCombination(int pos, + const nvinfer1::PluginTensorDesc* inOut, + int nbInputs, int nbOutputs) override; + void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, + int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* out, + int nbOutputs) override; + size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, + int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, + int nbOutputs) const override; + int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, void* const* outputs, void* workspace, + cudaStream_t stream) override; + + nvinfer1::DataType getOutputDataType(int index, + const nvinfer1::DataType* inputTypes, + int nbInputs) const override; + + const char* getPluginType() const override; + int getNbOutputs() const override; + int initialize() override; + void terminate() override; + size_t getSerializationSize() const override; + void serialize(void* buffer) const override; + void destroy() override; + + private: + template + int enqueue_impl(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, void* const* outputs, + void* workspace, cudaStream_t stream); + + nvinfer1::DataType data_type_; + int pooled_height_; + int pooled_width_; + float spatial_scale_; + int sampling_ratio_; + int smem_per_block_; + std::string namespace_; +}; + +class RoiAlignPluginDynamicCreator : public nvinfer1::IPluginCreator { + public: + RoiAlignPluginDynamicCreator(); + ~RoiAlignPluginDynamicCreator() override = default; + + void setPluginNamespace(const char* lib_namespace) override; + const char* getPluginNamespace() const override; + const char* getPluginName() const override; + const char* getPluginVersion() const override; + const nvinfer1::PluginFieldCollection* getFieldNames() override; + + nvinfer1::IPluginV2Ext* createPlugin( + const char* name, const nvinfer1::PluginFieldCollection* fc) override; + nvinfer1::IPluginV2Ext* deserializePlugin(const char* name, + const void* serial_data, + size_t serial_length) override; + + private: + std::string namespace_; + nvinfer1::PluginFieldCollection field_collection_; +}; +REGISTER_TRT_PLUGIN_V2(RoiAlignPluginDynamicCreator); +#endif + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.cu index 3b9eea22199d7b1669802fb506fb4218529b4468..7be9e3a740ab1c3532f5a67f06048c6c745eb214 100644 --- a/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.cu +++ 
b/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.cu @@ -54,11 +54,6 @@ void SkipLayerNormPluginDynamic::terminate() { nvinfer1::DimsExprs SkipLayerNormPluginDynamic::getOutputDimensions( int output_index, const nvinfer1::DimsExprs *inputs, int nb_inputs, nvinfer1::IExprBuilder &expr_builder) { - PADDLE_ENFORCE_EQ( - inputs[0].nbDims, 5, - platform::errors::InvalidArgument( - "The Input dim of the SkipLayernorm should be 5, but it's (%d) now.", - inputs[0].nbDims)); return inputs[0]; } diff --git a/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.h index 0e457fdc8f4474e4f7152aac3520193d70b22e65..ac621784550f2f15f69b17fb5fcbd61f32e2eba7 100644 --- a/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.h @@ -119,9 +119,9 @@ class SkipLayerNormPluginDynamic : public DynamicPluginTensorRT { float eps_; }; -class SkipLayerNormPluginV2Creator : public nvinfer1::IPluginCreator { +class SkipLayerNormPluginDynamicCreator : public nvinfer1::IPluginCreator { public: - SkipLayerNormPluginV2Creator() {} + SkipLayerNormPluginDynamicCreator() {} const char* getPluginName() const override { return "skip_layernorm_plugin"; } const char* getPluginVersion() const override { return "1"; } @@ -156,7 +156,7 @@ class SkipLayerNormPluginV2Creator : public nvinfer1::IPluginCreator { nvinfer1::PluginFieldCollection field_collection_; std::vector plugin_attributes_; }; -REGISTER_TRT_PLUGIN_V2(SkipLayerNormPluginV2Creator); +REGISTER_TRT_PLUGIN_V2(SkipLayerNormPluginDynamicCreator); #endif } // namespace plugin diff --git a/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.h index 340406c5e7fae8bf3f298228259e9fa33fc76887..9d4f9a35c3b6fe02981853eb3c0a697d5cb3a199 100644 --- a/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.h @@ -121,9 +121,9 @@ class SlicePluginDynamic : public DynamicPluginTensorRT { cudaStream_t copy_stream_; }; -class SlicePluginV2Creator : public nvinfer1::IPluginCreator { +class SlicePluginDynamicCreator : public nvinfer1::IPluginCreator { public: - SlicePluginV2Creator() {} + SlicePluginDynamicCreator() {} const char* getPluginName() const override { return "slice_plugin"; } const char* getPluginVersion() const override { return "1"; } @@ -155,7 +155,7 @@ class SlicePluginV2Creator : public nvinfer1::IPluginCreator { nvinfer1::PluginFieldCollection field_collection_; }; -REGISTER_TRT_PLUGIN_V2(SlicePluginV2Creator); +REGISTER_TRT_PLUGIN_V2(SlicePluginDynamicCreator); #endif diff --git a/paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.cu index 250b944652b93c54ae9587271256b42c6e1bc6b7..fdb14f9ceaf29fe90cd756b77e7c5afff2296f44 100644 --- a/paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.cu @@ -62,6 +62,8 @@ nvinfer1::DimsExprs SpecialSlicePluginDynamic::getOutputDimensions( output.d[1] = one; output.d[0] = expr_builder.operation(nvinfer1::DimensionOperation::kSUB, *inputs[1].d[0], *one); + // remove padding 1 + output.nbDims -= 2; return output; } diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu index 
256aa28206ad1c21dc3245a6e78f7cdc59b29156..1b5c39f8fff855fac4ef8f2ee54faa872023ad05 100644 --- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu @@ -22,11 +22,6 @@ namespace inference { namespace tensorrt { namespace plugin { -SplitPlugin* CreateSplitPluginDeserialize(const void* buffer, size_t length) { - return new SplitPlugin(buffer, length); -} -REGISTER_TRT_PLUGIN("split_plugin", CreateSplitPluginDeserialize); - template __device__ int upper_bound(T const* vals, int n, T const& key) { int i = 0; diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h index 5c47ec3a990f584fd02b3515dbc642ffcd921709..1ee895154d6b046c6c18c2e374d3c63f1fcc5d62 100644 --- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h @@ -25,7 +25,7 @@ namespace inference { namespace tensorrt { namespace plugin { -class SplitPlugin : public PluginTensorRT { +class SplitPlugin : public PluginTensorRTV2Ext { public: SplitPlugin() {} SplitPlugin(int axis, std::vector const& output_lengths, bool with_fp16) @@ -39,13 +39,20 @@ class SplitPlugin : public PluginTensorRT { DeserializeValue(&serial_data, &serial_length, &output_length_); } - SplitPlugin* clone() const override { - auto* ptr = new SplitPlugin(axis_, output_length_, with_fp16_); + nvinfer1::IPluginV2Ext* clone() const override { + SplitPlugin* ptr = new SplitPlugin(axis_, output_length_, with_fp16_); + ptr->setPluginNamespace(this->getPluginNamespace()); ptr->shareData(this); return ptr; } - const char* getPluginType() const override { return "split_plugin"; } + nvinfer1::DataType getOutputDataType(int index, + const nvinfer1::DataType* input_types, + int nb_inputs) const override { + return input_types[0]; + } + + const char* getPluginType() const override { return "split_plugin_v2ext"; } int getNbOutputs() const override { return output_length_.size(); } nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* input_dims, @@ -53,17 +60,18 @@ class SplitPlugin : public PluginTensorRT { int initialize() override; void terminate() override; - int enqueue(int batchSize, const void* const* inputs, void** outputs, + int enqueue(int batch_size, const void* const* inputs, void** outputs, void* workspace, cudaStream_t stream) override; + void destroy() override { delete this; } + protected: - size_t getSerializationSize() override { - return SerializedSize(getPluginType()) + SerializedSize(axis_) + - SerializedSize(output_length_) + getBaseSerializationSize(); + size_t getSerializationSize() const override { + return SerializedSize(axis_) + SerializedSize(output_length_) + + getBaseSerializationSize(); } - void serialize(void* buffer) override { - SerializeValue(&buffer, getPluginType()); + void serialize(void* buffer) const override { serializeBase(buffer); SerializeValue(&buffer, axis_); SerializeValue(&buffer, output_length_); @@ -83,6 +91,47 @@ class SplitPlugin : public PluginTensorRT { void shareData(const SplitPlugin* another); }; +class SplitPluginCreator : public nvinfer1::IPluginCreator { + public: + SplitPluginCreator() {} + const char* getPluginName() const override { return "split_plugin_v2ext"; } + + const char* getPluginVersion() const override { return "1"; } + + const nvinfer1::PluginFieldCollection* getFieldNames() override { + return &field_collection_; + } + + nvinfer1::IPluginV2* createPlugin( + const char* name, const 
nvinfer1::PluginFieldCollection* fc) override { + // not implemented + return nullptr; + } + + nvinfer1::IPluginV2* deserializePlugin(const char* name, + const void* serial_data, + size_t serial_length) override { + auto plugin = new SplitPlugin(serial_data, serial_length); + return plugin; + } + + void setPluginNamespace(const char* lib_namespace) override { + plugin_namespace_ = lib_namespace; + } + + const char* getPluginNamespace() const override { + return plugin_namespace_.c_str(); + } + + private: + std::string plugin_namespace_; + std::string plugin_name_; + nvinfer1::PluginFieldCollection field_collection_{0, nullptr}; + std::vector plugin_attributes_; +}; + +REGISTER_TRT_PLUGIN_V2(SplitPluginCreator); + #if IS_TRT_VERSION_GE(6000) class SplitPluginDynamic : public DynamicPluginTensorRT { public: @@ -144,9 +193,9 @@ class SplitPluginDynamic : public DynamicPluginTensorRT { std::vector output_length_; }; -class SplitPluginV2Creator : public nvinfer1::IPluginCreator { +class SplitPluginDynamicCreator : public nvinfer1::IPluginCreator { public: - SplitPluginV2Creator() {} + SplitPluginDynamicCreator() {} const char* getPluginName() const override { return "split_plugin"; } const char* getPluginVersion() const override { return "1"; } @@ -182,7 +231,7 @@ class SplitPluginV2Creator : public nvinfer1::IPluginCreator { std::vector plugin_attributes_; }; -REGISTER_TRT_PLUGIN_V2(SplitPluginV2Creator); +REGISTER_TRT_PLUGIN_V2(SplitPluginDynamicCreator); #endif } // namespace plugin diff --git a/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.h index 85cc6916238fefa028310b07e02301f10e07aefd..11579aadcc45731123770352ef08b362ff3ef745 100644 --- a/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.h @@ -127,9 +127,9 @@ class SwishPluginDynamic : public DynamicPluginTensorRT { float beta_; }; -class SwishPluginV2Creator : public nvinfer1::IPluginCreator { +class SwishPluginDynamicCreator : public nvinfer1::IPluginCreator { public: - SwishPluginV2Creator() {} + SwishPluginDynamicCreator() {} const char* getPluginName() const override { return "swish_plugin"; } const char* getPluginVersion() const override { return "1"; } @@ -165,7 +165,7 @@ class SwishPluginV2Creator : public nvinfer1::IPluginCreator { std::vector plugin_attributes_; }; -REGISTER_TRT_PLUGIN_V2(SwishPluginV2Creator); +REGISTER_TRT_PLUGIN_V2(SwishPluginDynamicCreator); #endif } // namespace plugin diff --git a/paddle/fluid/inference/tensorrt/plugin/test_split_plugin.cc b/paddle/fluid/inference/tensorrt/plugin/test_split_plugin.cc new file mode 100644 index 0000000000000000000000000000000000000000..6636513a555f9e638e1dfdb54986010c76785e2a --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/test_split_plugin.cc @@ -0,0 +1,58 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include "paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +TEST(split_op_plugin, test_plugin) { + int axis = 1; + std::vector output_lengths{1, 1}; + bool with_fp16 = false; + std::vector input_types{nvinfer1::DataType::kFLOAT}; + std::vector input_dims; + + SplitPlugin sp_plugin(axis, output_lengths, with_fp16); + nvinfer1::Dims in_dims; + in_dims.nbDims = 4; + input_dims.push_back(in_dims); + sp_plugin.configurePlugin(input_dims.data(), 1, nullptr, 2, + input_types.data(), nullptr, nullptr, nullptr, + nvinfer1::PluginFormat::kNCHW, 4); + sp_plugin.initialize(); + sp_plugin.getPluginType(); + sp_plugin.canBroadcastInputAcrossBatch(0); + sp_plugin.getNbOutputs(); + auto clone_plugin = sp_plugin.clone(); + clone_plugin->setPluginNamespace("test"); + clone_plugin->destroy(); + sp_plugin.getOutputDataType(0, input_types.data(), 1); + sp_plugin.terminate(); +} + +TEST(split_op_plugin, test_plugin_creater) { + SplitPluginCreator creator; + creator.getFieldNames(); + creator.createPlugin("test", nullptr); + creator.setPluginNamespace("test"); +} + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc index fd721b161450d7a8d4660ca09ea3a1093d754664..55bc786746beafcf7b2df98d54e9391e6a59ba24 100644 --- a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc +++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc @@ -19,27 +19,50 @@ namespace inference { namespace tensorrt { namespace plugin { +inline void Seria(void*& buffer, // NOLINT + const std::vector& input_dims, + size_t max_batch_size, nvinfer1::DataType data_type, + nvinfer1::PluginFormat data_format, bool with_fp16) { + SerializeValue(&buffer, input_dims); + SerializeValue(&buffer, max_batch_size); + SerializeValue(&buffer, data_type); + SerializeValue(&buffer, data_format); + SerializeValue(&buffer, with_fp16); +} + +inline void Deseria(void const*& serial_data, size_t& serial_length, // NOLINT + std::vector* input_dims, + size_t* max_batch_size, nvinfer1::DataType* data_type, + nvinfer1::PluginFormat* data_format, bool* with_fp16) { + DeserializeValue(&serial_data, &serial_length, input_dims); + DeserializeValue(&serial_data, &serial_length, max_batch_size); + DeserializeValue(&serial_data, &serial_length, data_type); + DeserializeValue(&serial_data, &serial_length, data_format); + DeserializeValue(&serial_data, &serial_length, with_fp16); +} + +inline size_t SeriaSize(const std::vector& input_dims, + size_t max_batch_size, nvinfer1::DataType data_type, + nvinfer1::PluginFormat data_format, bool with_fp16) { + return (SerializedSize(input_dims) + SerializedSize(max_batch_size) + + SerializedSize(data_type) + SerializedSize(data_format) + + SerializedSize(with_fp16)); +} + void PluginTensorRT::serializeBase(void*& buffer) { - SerializeValue(&buffer, input_dims_); - SerializeValue(&buffer, max_batch_size_); - SerializeValue(&buffer, data_type_); - SerializeValue(&buffer, data_format_); - SerializeValue(&buffer, with_fp16_); + Seria(buffer, input_dims_, max_batch_size_, data_type_, data_format_, + with_fp16_); } void PluginTensorRT::deserializeBase(void const*& serial_data, size_t& serial_length) { - DeserializeValue(&serial_data, &serial_length, &input_dims_); - DeserializeValue(&serial_data, &serial_length, &max_batch_size_); - DeserializeValue(&serial_data, 
&serial_length, &data_type_); - DeserializeValue(&serial_data, &serial_length, &data_format_); - DeserializeValue(&serial_data, &serial_length, &with_fp16_); + Deseria(serial_data, serial_length, &input_dims_, &max_batch_size_, + &data_type_, &data_format_, &with_fp16_); } size_t PluginTensorRT::getBaseSerializationSize() { - return (SerializedSize(input_dims_) + SerializedSize(max_batch_size_) + - SerializedSize(data_type_) + SerializedSize(data_format_) + - SerializedSize(with_fp16_)); + return SeriaSize(input_dims_, max_batch_size_, data_type_, data_format_, + with_fp16_); } bool PluginTensorRT::supportsFormat(nvinfer1::DataType type, @@ -58,6 +81,35 @@ void PluginTensorRT::configureWithFormat( max_batch_size_ = max_batch_size; } +void PluginTensorRTV2Ext::serializeBase(void*& buffer) const { + Seria(buffer, input_dims_, max_batch_size_, data_type_, data_format_, + with_fp16_); +} + +void PluginTensorRTV2Ext::deserializeBase(void const*& serial_data, + size_t& serial_length) { + Deseria(serial_data, serial_length, &input_dims_, &max_batch_size_, + &data_type_, &data_format_, &with_fp16_); +} + +size_t PluginTensorRTV2Ext::getBaseSerializationSize() const { + return SeriaSize(input_dims_, max_batch_size_, data_type_, data_format_, + with_fp16_); +} + +void PluginTensorRTV2Ext::configurePlugin( + const nvinfer1::Dims* input_dims, int32_t nb_inputs, + const nvinfer1::Dims* output_dims, int32_t nb_outputs, + const nvinfer1::DataType* input_types, + const nvinfer1::DataType* output_types, const bool* input_is_broadcast, + const bool* output_is_broadcast, nvinfer1::PluginFormat float_format, + int32_t max_batch_size) { + input_dims_.assign(input_dims, input_dims + nb_inputs); + max_batch_size_ = max_batch_size; + data_format_ = float_format; + data_type_ = input_types[0]; +} + } // namespace plugin } // namespace tensorrt } // namespace inference diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h index b3a3abe5d01fc53e2ef3da7722df0e372d605af4..ce3133ae99e94c62c0c8e958065700373d270037 100644 --- a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h @@ -44,6 +44,7 @@ typedef std::function typedef std::function PluginConstructFunc; +// Deprecated. 
Do not inherit this class, please refer to PluginTensorRTV2Ext class PluginTensorRT : public nvinfer1::IPluginExt { public: PluginTensorRT() : with_fp16_(false) {} @@ -119,6 +120,114 @@ class PluginTensorRT : public nvinfer1::IPluginExt { bool with_fp16_; }; +// TensorRT introduced IPluginV2Ext after 5.1, Paddle no longer supports +// versions before 5.1 +class PluginTensorRTV2Ext : public nvinfer1::IPluginV2Ext { + public: + PluginTensorRTV2Ext() : with_fp16_(false) {} + PluginTensorRTV2Ext(const void* serialized_data, size_t length) {} + + nvinfer1::Dims const& getInputDims(int index) const { + return input_dims_.at(index); + } + size_t getMaxBatchSize() const { return max_batch_size_; } + nvinfer1::DataType getDataType() const { return data_type_; } + nvinfer1::PluginFormat getDataFormat() const { return data_format_; } + + // The Func in IPluginV2Ext + virtual nvinfer1::DataType getOutputDataType( + int index, const nvinfer1::DataType* input_types, + int nb_inputs) const = 0; + + virtual bool isOutputBroadcastAcrossBatch(int32_t output_index, + const bool* input_is_broadcasted, + int32_t nb_inputs) const { + return false; + } + + virtual bool canBroadcastInputAcrossBatch(int32_t input_index) const { + return false; + } + + void configurePlugin(const nvinfer1::Dims* input_dims, int32_t nb_inputs, + const nvinfer1::Dims* output_dims, int32_t nb_outputs, + const nvinfer1::DataType* input_types, + const nvinfer1::DataType* output_types, + const bool* input_is_broadcast, + const bool* output_is_broadcast, + nvinfer1::PluginFormat float_format, + int32_t max_batch_size) override; + + virtual IPluginV2Ext* clone() const = 0; + + void attachToContext(cudnnContext*, cublasContext*, + nvinfer1::IGpuAllocator*) override {} + + void detachFromContext() override {} + + // The Func in IPluginV2 + virtual const char* getPluginType() const = 0; + const char* getPluginVersion() const override { return "1"; } + virtual int32_t getNbOutputs() const { return 1; } + virtual nvinfer1::Dims getOutputDimensions(int32_t index, + const nvinfer1::Dims* inputs, + int32_t nb_input) = 0; + // Check format support. The default is FLOAT32 and NCHW. + bool supportsFormat(nvinfer1::DataType type, + nvinfer1::PluginFormat format) const override { + return ((type == nvinfer1::DataType::kFLOAT) && + (format == nvinfer1::PluginFormat::kNCHW)); + } + // Initialize the layer for execution. + // This is called when the engine is created. + int initialize() override { return 0; } + + // Shutdown the layer. This is called when the engine is destroyed + void terminate() override {} + + // Find the workspace size required by the layer + size_t getWorkspaceSize(int) const override { return 0; } + + // Execute the layer + virtual int enqueue(int batch_size, const void* const* inputs, void** outputs, + void* workspace, cudaStream_t stream) = 0; + + // Find the size of the serialization buffer required + virtual size_t getSerializationSize() const = 0; + + // Serialize the layer config to buffer. + // TensorRT will call this func to serialize the configuration of TensorRT + // engine. It should not be called by users. 
+ virtual void serialize(void* buffer) const = 0; + + virtual void destroy() = 0; + + void setPluginNamespace(const char* plugin_namespace) override { + name_space_ = plugin_namespace; + } + + const char* getPluginNamespace() const override { + return name_space_.c_str(); + } + + protected: + void deserializeBase(void const*& serial_data, // NOLINT + size_t& serial_length); // NOLINT + size_t getBaseSerializationSize() const; + void serializeBase(void*& buffer) const; // NOLINT + + protected: + std::vector input_dims_; + size_t max_batch_size_; + nvinfer1::DataType data_type_; + nvinfer1::PluginFormat data_format_; + std::vector inputs_; + bool with_fp16_; + + private: + std::string name_space_; +}; + #if IS_TRT_VERSION_GE(6000) class DynamicPluginTensorRT : public nvinfer1::IPluginV2DynamicExt { public: @@ -184,6 +293,7 @@ class DynamicPluginTensorRT : public nvinfer1::IPluginV2DynamicExt { std::string name_space_; std::string plugin_base_; }; +#endif template class TrtPluginRegistrarV2 { @@ -203,8 +313,6 @@ class TrtPluginRegistrarV2 { static paddle::inference::tensorrt::plugin::TrtPluginRegistrarV2 \ plugin_registrar_##name {} -#endif - } // namespace plugin } // namespace tensorrt } // namespace inference diff --git a/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu new file mode 100644 index 0000000000000000000000000000000000000000..13d07e774036a48b0ed6e3c91b168eaab4461df5 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu @@ -0,0 +1,401 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
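Each creator above is registered through REGISTER_TRT_PLUGIN_V2, which expands to a file-scope TrtPluginRegistrarV2<Creator> instance. The registrar body is not shown in the hunk; a minimal sketch of what such a registrar is assumed to do, namely hand the creator to TensorRT's global plugin registry so serialized engines can later look plugins up by type and version:

// Sketch only: static registration of an IPluginCreator with TensorRT.
template <typename PluginCreatorT>
class TrtPluginRegistrarV2 {
 public:
  TrtPluginRegistrarV2() {
    // getPluginRegistry() is nvinfer1's global registry; the empty string is
    // the plugin namespace, matching the creators defined above.
    getPluginRegistry()->registerCreator(creator_, "");
  }

 private:
  PluginCreatorT creator_;
};

// REGISTER_TRT_PLUGIN_V2(SplitPluginCreator) then becomes a static
// TrtPluginRegistrarV2<SplitPluginCreator> object, so a deserialized engine
// can recover "split_plugin_v2ext" plugins via the creator's deserializePlugin.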
+ +#include +#include +#include +#include + +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h" +#include "paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.h" +#include "paddle/fluid/operators/detection/yolo_box_op.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +YoloBoxPlugin::YoloBoxPlugin(const nvinfer1::DataType data_type, + const std::vector& anchors, + const int class_num, const float conf_thresh, + const int downsample_ratio, const bool clip_bbox, + const float scale_x_y, const int input_h, + const int input_w) + : data_type_(data_type), + class_num_(class_num), + conf_thresh_(conf_thresh), + downsample_ratio_(downsample_ratio), + clip_bbox_(clip_bbox), + scale_x_y_(scale_x_y), + input_h_(input_h), + input_w_(input_w) { + anchors_.insert(anchors_.end(), anchors.cbegin(), anchors.cend()); + assert(data_type_ == nvinfer1::DataType::kFLOAT || + data_type_ == nvinfer1::DataType::kHALF); + assert(class_num_ > 0); + assert(input_h_ > 0); + assert(input_w_ > 0); + + cudaMalloc(&anchors_device_, anchors.size() * sizeof(int)); + cudaMemcpy(anchors_device_, anchors.data(), anchors.size() * sizeof(int), + cudaMemcpyHostToDevice); +} + +YoloBoxPlugin::YoloBoxPlugin(const void* data, size_t length) { + DeserializeValue(&data, &length, &data_type_); + DeserializeValue(&data, &length, &anchors_); + DeserializeValue(&data, &length, &class_num_); + DeserializeValue(&data, &length, &conf_thresh_); + DeserializeValue(&data, &length, &downsample_ratio_); + DeserializeValue(&data, &length, &clip_bbox_); + DeserializeValue(&data, &length, &scale_x_y_); + DeserializeValue(&data, &length, &input_h_); + DeserializeValue(&data, &length, &input_w_); +} + +YoloBoxPlugin::~YoloBoxPlugin() { + if (anchors_device_ != nullptr) { + cudaFree(anchors_device_); + anchors_device_ = nullptr; + } +} + +const char* YoloBoxPlugin::getPluginType() const { return "yolo_box_plugin"; } + +const char* YoloBoxPlugin::getPluginVersion() const { return "1"; } + +int YoloBoxPlugin::getNbOutputs() const { return 2; } + +nvinfer1::Dims YoloBoxPlugin::getOutputDimensions(int index, + const nvinfer1::Dims* inputs, + int nb_input_dims) { + const int anchor_num = anchors_.size() / 2; + const int box_num = inputs[0].d[1] * inputs[0].d[2] * anchor_num; + + assert(index <= 1); + + if (index == 0) { + return nvinfer1::Dims2(box_num, 4); + } + return nvinfer1::Dims2(box_num, class_num_); +} + +bool YoloBoxPlugin::supportsFormat(nvinfer1::DataType type, + nvinfer1::TensorFormat format) const { + return ((type == data_type_ || type == nvinfer1::DataType::kINT32) && + format == nvinfer1::TensorFormat::kLINEAR); +} + +size_t YoloBoxPlugin::getWorkspaceSize(int max_batch_size) const { return 0; } + +template +__device__ inline T sigmoid(T x) { + return 1. / (1. 
+ exp(-x)); +} + +template <> +__device__ inline float sigmoid(float x) { + return 1.f / (1.f + expf(-x)); +} + +template +__device__ inline void GetYoloBox(float* box, const T* x, const int* anchors, + int i, int j, int an_idx, int grid_size_h, + int grid_size_w, int input_size_h, + int input_size_w, int index, int stride, + int img_height, int img_width, float scale, + float bias) { + box[0] = static_cast( + (i + sigmoid(static_cast(x[index]) * scale + bias)) * img_width / + grid_size_w); + box[1] = static_cast( + (j + sigmoid(static_cast(x[index + stride]) * scale + bias)) * + img_height / grid_size_h); + box[2] = static_cast(expf(static_cast(x[index + 2 * stride])) * + anchors[2 * an_idx] * img_width / input_size_w); + box[3] = + static_cast(expf(static_cast(x[index + 3 * stride])) * + anchors[2 * an_idx + 1] * img_height / input_size_h); +} + +__device__ inline int GetEntryIndex(int batch, int an_idx, int hw_idx, + int an_num, int an_stride, int stride, + int entry) { + return (batch * an_num + an_idx) * an_stride + entry * stride + hw_idx; +} + +template +__device__ inline void CalcDetectionBox(T* boxes, const float* box, + const int box_idx, const int img_height, + const int img_width, bool clip_bbox) { + float tmp_box_0, tmp_box_1, tmp_box_2, tmp_box_3; + tmp_box_0 = box[0] - box[2] / 2; + tmp_box_1 = box[1] - box[3] / 2; + tmp_box_2 = box[0] + box[2] / 2; + tmp_box_3 = box[1] + box[3] / 2; + + if (clip_bbox) { + tmp_box_0 = max(tmp_box_0, 0.f); + tmp_box_1 = max(tmp_box_1, 0.f); + tmp_box_2 = min(tmp_box_2, static_cast(img_width - 1)); + tmp_box_3 = min(tmp_box_3, static_cast(img_height - 1)); + } + + boxes[box_idx + 0] = static_cast(tmp_box_0); + boxes[box_idx + 1] = static_cast(tmp_box_1); + boxes[box_idx + 2] = static_cast(tmp_box_2); + boxes[box_idx + 3] = static_cast(tmp_box_3); +} + +template +__device__ inline void CalcLabelScore(T* scores, const T* input, + const int label_idx, const int score_idx, + const int class_num, const float conf, + const int stride) { + for (int i = 0; i < class_num; i++) { + scores[score_idx + i] = static_cast( + conf * sigmoid(static_cast(input[label_idx + i * stride]))); + } +} + +template +__global__ void KeYoloBoxFw(const T* const input, const int* const imgsize, + T* boxes, T* scores, const float conf_thresh, + const int* anchors, const int n, const int h, + const int w, const int an_num, const int class_num, + const int box_num, int input_size_h, + int input_size_w, bool clip_bbox, const float scale, + const float bias) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + float box[4]; + for (; tid < n * box_num; tid += stride) { + int grid_num = h * w; + int i = tid / box_num; + int j = (tid % box_num) / grid_num; + int k = (tid % grid_num) / w; + int l = tid % w; + + int an_stride = (5 + class_num) * grid_num; + int img_height = imgsize[2 * i]; + int img_width = imgsize[2 * i + 1]; + + int obj_idx = + GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 4); + float conf = sigmoid(static_cast(input[obj_idx])); + int box_idx = + GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 0); + + if (conf < conf_thresh) { + for (int i = 0; i < 4; ++i) { + box[i] = 0.f; + } + } else { + GetYoloBox(box, input, anchors, l, k, j, h, w, input_size_h, + input_size_w, box_idx, grid_num, img_height, img_width, + scale, bias); + } + + box_idx = (i * box_num + j * grid_num + k * w + l) * 4; + CalcDetectionBox(boxes, box, box_idx, img_height, img_width, clip_bbox); + + int label_idx = + GetEntryIndex(i, 
j, k * w + l, an_num, an_stride, grid_num, 5); + int score_idx = (i * box_num + j * grid_num + k * w + l) * class_num; + CalcLabelScore(scores, input, label_idx, score_idx, class_num, conf, + grid_num); + } +} + +template +int YoloBoxPlugin::enqueue_impl(int batch_size, const void* const* inputs, + void** outputs, void* workspace, + cudaStream_t stream) { + const int n = batch_size; + const int h = input_h_; + const int w = input_w_; + const int an_num = anchors_.size() / 2; + const int box_num = h * w * an_num; + int input_size_h = downsample_ratio_ * h; + int input_size_w = downsample_ratio_ * w; + + float bias = -0.5 * (scale_x_y_ - 1.); + constexpr int threads = 256; + + KeYoloBoxFw<<<(n * box_num + threads - 1) / threads, threads, 0, stream>>>( + reinterpret_cast(inputs[0]), + reinterpret_cast(inputs[1]), + reinterpret_cast(outputs[0]), reinterpret_cast(outputs[1]), + conf_thresh_, anchors_device_, n, h, w, an_num, class_num_, box_num, + input_size_h, input_size_w, clip_bbox_, scale_x_y_, bias); + return cudaGetLastError() != cudaSuccess; +} + +int YoloBoxPlugin::enqueue(int batch_size, const void* const* inputs, + void** outputs, void* workspace, + cudaStream_t stream) { + if (data_type_ == nvinfer1::DataType::kFLOAT) { + return enqueue_impl(batch_size, inputs, outputs, workspace, stream); + } else if (data_type_ == nvinfer1::DataType::kHALF) { + return enqueue_impl(batch_size, inputs, outputs, workspace, stream); + } + assert("unsupported type."); +} + +int YoloBoxPlugin::initialize() { return 0; } + +void YoloBoxPlugin::terminate() {} + +size_t YoloBoxPlugin::getSerializationSize() const { + size_t serialize_size = 0; + serialize_size += SerializedSize(data_type_); + serialize_size += SerializedSize(anchors_); + serialize_size += SerializedSize(class_num_); + serialize_size += SerializedSize(conf_thresh_); + serialize_size += SerializedSize(downsample_ratio_); + serialize_size += SerializedSize(clip_bbox_); + serialize_size += SerializedSize(scale_x_y_); + serialize_size += SerializedSize(input_h_); + serialize_size += SerializedSize(input_w_); + return serialize_size; +} + +void YoloBoxPlugin::serialize(void* buffer) const { + SerializeValue(&buffer, data_type_); + SerializeValue(&buffer, anchors_); + SerializeValue(&buffer, class_num_); + SerializeValue(&buffer, conf_thresh_); + SerializeValue(&buffer, downsample_ratio_); + SerializeValue(&buffer, clip_bbox_); + SerializeValue(&buffer, scale_x_y_); + SerializeValue(&buffer, input_h_); + SerializeValue(&buffer, input_w_); +} + +void YoloBoxPlugin::destroy() {} + +void YoloBoxPlugin::setPluginNamespace(const char* lib_namespace) { + namespace_ = std::string(lib_namespace); +} + +const char* YoloBoxPlugin::getPluginNamespace() const { + return namespace_.c_str(); +} + +nvinfer1::DataType YoloBoxPlugin::getOutputDataType( + int index, const nvinfer1::DataType* input_type, int nb_inputs) const { + return data_type_; +} + +bool YoloBoxPlugin::isOutputBroadcastAcrossBatch(int output_index, + const bool* input_is_broadcast, + int nb_inputs) const { + return false; +} + +bool YoloBoxPlugin::canBroadcastInputAcrossBatch(int input_index) const { + return false; +} + +void YoloBoxPlugin::configurePlugin( + const nvinfer1::Dims* input_dims, int nb_inputs, + const nvinfer1::Dims* output_dims, int nb_outputs, + const nvinfer1::DataType* input_types, + const nvinfer1::DataType* output_types, const bool* input_is_broadcast, + const bool* output_is_broadcast, nvinfer1::PluginFormat float_format, + int max_batct_size) {} + +nvinfer1::IPluginV2Ext* 
YoloBoxPlugin::clone() const { + return new YoloBoxPlugin(data_type_, anchors_, class_num_, conf_thresh_, + downsample_ratio_, clip_bbox_, scale_x_y_, input_h_, + input_w_); +} + +YoloBoxPluginCreator::YoloBoxPluginCreator() {} + +void YoloBoxPluginCreator::setPluginNamespace(const char* lib_namespace) { + namespace_ = std::string(lib_namespace); +} + +const char* YoloBoxPluginCreator::getPluginNamespace() const { + return namespace_.c_str(); +} + +const char* YoloBoxPluginCreator::getPluginName() const { + return "yolo_box_plugin"; +} + +const char* YoloBoxPluginCreator::getPluginVersion() const { return "1"; } + +const nvinfer1::PluginFieldCollection* YoloBoxPluginCreator::getFieldNames() { + return &field_collection_; +} + +nvinfer1::IPluginV2Ext* YoloBoxPluginCreator::createPlugin( + const char* name, const nvinfer1::PluginFieldCollection* fc) { + const nvinfer1::PluginField* fields = fc->fields; + + int type_id = -1; + std::vector anchors; + int class_num = -1; + float conf_thresh = 0.01; + int downsample_ratio = 32; + bool clip_bbox = true; + float scale_x_y = 1.; + int h = -1; + int w = -1; + + for (int i = 0; i < fc->nbFields; ++i) { + const std::string field_name(fc->fields[i].name); + if (field_name.compare("type_id") == 0) { + type_id = *static_cast(fc->fields[i].data); + } else if (field_name.compare("anchors")) { + const int length = fc->fields[i].length; + const int* data = static_cast(fc->fields[i].data); + anchors.insert(anchors.end(), data, data + length); + } else if (field_name.compare("class_num")) { + class_num = *static_cast(fc->fields[i].data); + } else if (field_name.compare("conf_thresh")) { + conf_thresh = *static_cast(fc->fields[i].data); + } else if (field_name.compare("downsample_ratio")) { + downsample_ratio = *static_cast(fc->fields[i].data); + } else if (field_name.compare("clip_bbox")) { + clip_bbox = *static_cast(fc->fields[i].data); + } else if (field_name.compare("scale_x_y")) { + scale_x_y = *static_cast(fc->fields[i].data); + } else if (field_name.compare("h")) { + h = *static_cast(fc->fields[i].data); + } else if (field_name.compare("w")) { + w = *static_cast(fc->fields[i].data); + } else { + assert(false && "unknown plugin field name."); + } + } + + return new YoloBoxPlugin( + type_id ? nvinfer1::DataType::kHALF : nvinfer1::DataType::kFLOAT, anchors, + class_num, conf_thresh, downsample_ratio, clip_bbox, scale_x_y, h, w); +} + +nvinfer1::IPluginV2Ext* YoloBoxPluginCreator::deserializePlugin( + const char* name, const void* serial_data, size_t serial_length) { + auto plugin = new YoloBoxPlugin(serial_data, serial_length); + plugin->setPluginNamespace(namespace_.c_str()); + return plugin; +} + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.h new file mode 100644 index 0000000000000000000000000000000000000000..8ca21da7ae0377164cbb50c502f0abb5ca943058 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.h @@ -0,0 +1,117 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include "paddle/fluid/inference/tensorrt/engine.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +class YoloBoxPlugin : public nvinfer1::IPluginV2Ext { + public: + explicit YoloBoxPlugin(const nvinfer1::DataType data_type, + const std::vector& anchors, const int class_num, + const float conf_thresh, const int downsample_ratio, + const bool clip_bbox, const float scale_x_y, + const int input_h, const int input_w); + YoloBoxPlugin(const void* data, size_t length); + ~YoloBoxPlugin() override; + + const char* getPluginType() const override; + const char* getPluginVersion() const override; + int getNbOutputs() const override; + nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, + int nb_input_dims) override; + bool supportsFormat(nvinfer1::DataType type, + nvinfer1::TensorFormat format) const override; + size_t getWorkspaceSize(int max_batch_size) const override; + int enqueue(int batch_size, const void* const* inputs, void** outputs, + void* workspace, cudaStream_t stream) override; + template + int enqueue_impl(int batch_size, const void* const* inputs, void** outputs, + void* workspace, cudaStream_t stream); + int initialize() override; + void terminate() override; + size_t getSerializationSize() const override; + void serialize(void* buffer) const override; + void destroy() override; + void setPluginNamespace(const char* lib_namespace) override; + const char* getPluginNamespace() const override; + + nvinfer1::DataType getOutputDataType(int index, + const nvinfer1::DataType* input_type, + int nb_inputs) const override; + bool isOutputBroadcastAcrossBatch(int output_index, + const bool* input_is_broadcast, + int nb_inputs) const override; + bool canBroadcastInputAcrossBatch(int input_index) const override; + void configurePlugin(const nvinfer1::Dims* input_dims, int nb_inputs, + const nvinfer1::Dims* output_dims, int nb_outputs, + const nvinfer1::DataType* input_types, + const nvinfer1::DataType* output_types, + const bool* input_is_broadcast, + const bool* output_is_broadcast, + nvinfer1::PluginFormat float_format, + int max_batct_size) override; + nvinfer1::IPluginV2Ext* clone() const override; + + private: + nvinfer1::DataType data_type_; + std::vector anchors_; + int* anchors_device_; + int class_num_; + float conf_thresh_; + int downsample_ratio_; + bool clip_bbox_; + float scale_x_y_; + int input_h_; + int input_w_; + std::string namespace_; +}; + +class YoloBoxPluginCreator : public nvinfer1::IPluginCreator { + public: + YoloBoxPluginCreator(); + ~YoloBoxPluginCreator() override = default; + + void setPluginNamespace(const char* lib_namespace) override; + const char* getPluginNamespace() const override; + const char* getPluginName() const override; + const char* getPluginVersion() const override; + const nvinfer1::PluginFieldCollection* getFieldNames() override; + + nvinfer1::IPluginV2Ext* createPlugin( + const char* name, const nvinfer1::PluginFieldCollection* fc) override; + 
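+  // createPlugin() above builds the plugin from the PluginField attributes
+  // supplied when the network is defined; deserializePlugin() below rebuilds
+  // it from the byte stream written by YoloBoxPlugin::serialize() when a
+  // saved engine is loaded. A typical lookup through the TensorRT plugin
+  // registry might look like this (illustrative sketch only; the field
+  // collection name is hypothetical):
+  //   auto* creator =
+  //       getPluginRegistry()->getPluginCreator("yolo_box_plugin", "1");
+  //   auto* plugin = creator->createPlugin("yolo_box", &field_collection);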
nvinfer1::IPluginV2Ext* deserializePlugin(const char* name, + const void* serial_data, + size_t serial_length) override; + + private: + std::string namespace_; + nvinfer1::PluginFieldCollection field_collection_; +}; + +REGISTER_TRT_PLUGIN_V2(YoloBoxPluginCreator); + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 92f9c20a369d7a6e295b52d66fe61c244fafe943..f74cd671d6dca0cd52bb595f6ee1370b464d9e30 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -522,15 +522,15 @@ if(WITH_GPU AND TENSORRT_FOUND) inference_analysis_test(trt_instance_norm_test SRCS trt_instance_norm_converter_test.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${TEST_INSTANCE_NORM_MODEL}/) - inference_analysis_test(test_analyzer_capi_gpu SRCS analyzer_capi_gpu_tester.cc + inference_analysis_test(test_analyzer_capi_exp_gpu SRCS analyzer_capi_exp_gpu_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_inference_c ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models) - inference_analysis_test(test_analyzer_capi_xpu SRCS analyzer_capi_xpu_tester.cc + inference_analysis_test(test_analyzer_capi_exp_xpu SRCS analyzer_capi_exp_xpu_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_inference_c ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models) set(TRT_MODEL_QUANT_RESNET_DIR "${INFERENCE_DEMO_INSTALL_DIR}/small_quant_model") - if (NOT EXISTS ${TRT_MODEL_QUANT_RESNET_DIR}/small_quant_model.tgz) + if (NOT EXISTS ${INFERENCE_DEMO_INSTALL_DIR}/small_quant_model.tgz) inference_download_and_uncompress(${INFERENCE_DEMO_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "small_quant_model.tgz") endif() inference_analysis_test(trt_quant_int8_test SRCS trt_quant_int8_test.cc @@ -538,7 +538,7 @@ if(WITH_GPU AND TENSORRT_FOUND) ARGS --infer_model=${TRT_MODEL_QUANT_RESNET_DIR}) set(TRT_MODEL_QUANT_YOLOV3_DIR "${INFERENCE_DEMO_INSTALL_DIR}/yolov3_r50_quant_aware") - if (NOT EXISTS ${TRT_MODEL_QUANT_YOLOV3_DIR}/yolov3_r50_quant_aware.tgz) + if (NOT EXISTS ${INFERENCE_DEMO_INSTALL_DIR}/yolov3_r50_quant_aware.tgz) inference_download_and_uncompress(${INFERENCE_DEMO_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "yolov3_r50_quant_aware.tgz") endif() inference_analysis_test(trt_quant_int8_yolov3_r50_test SRCS trt_quant_int8_yolov3_r50_test.cc @@ -576,8 +576,7 @@ if(WITH_GPU AND TENSORRT_FOUND) EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${TEST_TRT_TRANSFORMER_PRUNE_MODEL}/transformer_prune) - set(TEST_TRT_ERNIE_UNSER_MODEL "${TRT_MODEL_INSTALL_DIR}/ernie_test/ernie_model_4_unserialized/") - if (NOT EXISTS ${TEST_TRT_ERNIE_UNSER_MODEL}/ernie_model_4_unserialized.tgz) + if (NOT EXISTS ${TEST_TRT_ERNIE_MODEL}/ernie_model_4_unserialized.tgz) inference_download_and_uncompress(${TEST_TRT_ERNIE_MODEL} ${INFERENCE_URL}/tensorrt_test "ernie_model_4_unserialized.tgz") endif() @@ -585,8 +584,7 @@ if(WITH_GPU AND TENSORRT_FOUND) EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${TEST_TRT_ERNIE_MODEL}/ernie_model_4_unserialized) - set(TEST_TRT_ERNIE_UNSER_FP16_MODEL "${TRT_MODEL_INSTALL_DIR}/ernie_test/ernie_model_4_fp16_unserialized/") - if (NOT EXISTS ${TEST_TRT_ERNIE_UNSER_FP16_MODEL}/ernie_model_4_unserialized.tgz) + if (NOT EXISTS ${TEST_TRT_ERNIE_MODEL}/ernie_model_4_fp16_unserialized.tgz) inference_download_and_uncompress(${TEST_TRT_ERNIE_MODEL} ${INFERENCE_URL}/tensorrt_test 
"ernie_model_4_fp16_unserialized.tgz") endif() @@ -606,14 +604,23 @@ inference_analysis_test(lite_resnet50_test SRCS lite_resnet50_test.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${RESNET50_MODEL_DIR}) -inference_analysis_test(test_analyzer_capi SRCS analyzer_capi_tester.cc - EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_inference_c - ARGS --infer_model=${RESNET50_MODEL_DIR}/model) +inference_analysis_test(test_analyzer_capi_exp SRCS analyzer_capi_exp_tester.cc + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_inference_c + ARGS --infer_model=${RESNET50_MODEL_DIR}/model) + +inference_analysis_test(test_analyzer_capi_exp_pd_config SRCS analyzer_capi_exp_pd_config_tester.cc + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_inference_c + ARGS --infer_model=${MOBILENET_INSTALL_DIR}/model) + +inference_analysis_test(test_analyzer_capi_exp_pd_tensor SRCS analyzer_capi_exp_pd_tensor_tester.cc + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_inference_c + ARGS --infer_model=${MOBILENET_INSTALL_DIR}/model) -inference_analysis_test(test_analyzer_capi_pd_tensor SRCS analyzer_capi_pd_tensor_tester.cc +if (NOT APPLE AND NOT WIN32) + inference_analysis_test(test_analyzer_capi_exp_pd_threads SRCS analyzer_capi_exp_pd_threads_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_inference_c ARGS --infer_model=${MOBILENET_INSTALL_DIR}/model) - +endif() inference_analysis_test(test_analyzer_zerocopytensor_tensor SRCS analyzer_zerocopy_tensor_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${OCR_INSTALL_DIR}/model) @@ -623,17 +630,17 @@ inference_analysis_test(test_analyzer_paddletensor_tensor SRCS analyzer_paddle_t ARGS --infer_model=${OCR_INSTALL_DIR}/model --infer_data=${OCR_INSTALL_DIR}/data.txt --refer_result=${OCR_INSTALL_DIR}/result.txt) if(WITH_MKLDNN) - inference_analysis_test(test_analyzer_capi_int SRCS analyzer_capi_int_tester.cc + inference_analysis_test(test_analyzer_capi_exp_int SRCS analyzer_capi_exp_int_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_inference_c ARGS --infer_model=${INT8_DATA_DIR}/resnet50/model) - endif() +endif() -inference_analysis_test(test_analyzer_capi_ner SRCS analyzer_capi_ner_tester.cc +inference_analysis_test(test_analyzer_capi_exp_ner SRCS analyzer_capi_exp_ner_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_inference_c ARGS --infer_model=${CHINESE_NER_INSTALL_DIR}/model) if(WITH_GPU) - inference_analysis_test(paddle_infer_api_test SRCS paddle_infer_api_test.cc + inference_analysis_test(paddle_infer_api_test SRCS paddle_infer_api_test.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${RESNET50_MODEL_DIR}) endif() diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_gpu_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_gpu_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..de9e2afd705f9366e3e703abab517b85be766018 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_gpu_tester.cc @@ -0,0 +1,160 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include +#include +#include "paddle/fluid/inference/capi_exp/pd_inference_api.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +TEST(PD_Config, gpu_interface) { + std::string model_dir = FLAGS_infer_model + "/mobilenet"; + std::string prog_file = model_dir + "/__model__"; + std::string param_file = model_dir + "/__params__"; + std::string opt_cache_dir = FLAGS_infer_model + "/OptimCacheDir"; + const char* ops_name = "conv_2d"; + + PD_Config* config = PD_ConfigCreate(); + PD_ConfigSetModel(config, prog_file.c_str(), param_file.c_str()); + PD_ConfigSetOptimCacheDir(config, opt_cache_dir.c_str()); + + PD_ConfigEnableUseGpu(config, 100, 0); + bool use_gpu = PD_ConfigUseGpu(config); + EXPECT_TRUE(use_gpu); + int init_size = PD_ConfigMemoryPoolInitSizeMb(config); + EXPECT_EQ(init_size, 100); + int gpu_device_id = PD_ConfigGpuDeviceId(config); + EXPECT_EQ(gpu_device_id, 0); + float frac = PD_ConfigFractionOfGpuMemoryForPool(config); + LOG(INFO) << frac; + PD_ConfigEnableCudnn(config); + bool cudnn = PD_ConfigCudnnEnabled(config); + EXPECT_TRUE(cudnn); + + PD_ConfigEnableTensorRtEngine(config, 1 << 20, 1, 3, PD_PRECISION_INT8, FALSE, + TRUE); + bool trt_enable = PD_ConfigTensorRtEngineEnabled(config); + EXPECT_TRUE(trt_enable); + + const char* tensor_name = "image"; + size_t shapes_num[1] = {4}; + int32_t min_shape[4] = {1, 3, 36, 36}; + int32_t max_shape[4] = {1, 3, 224, 224}; + int32_t opt_shape[4] = {1, 3, 224, 224}; + int32_t* min_shape_ptr = min_shape; + int32_t* max_shape_ptr = max_shape; + int32_t* opt_shape_ptr = opt_shape; + PD_ConfigSetTrtDynamicShapeInfo(config, 1, &tensor_name, shapes_num, + &min_shape_ptr, &max_shape_ptr, + &opt_shape_ptr, FALSE); + PD_ConfigDisableTensorRtOPs(config, 1, &ops_name); + PD_ConfigEnableTensorRtOSS(config); + bool oss_enabled = PD_ConfigTensorRtOssEnabled(config); + EXPECT_TRUE(oss_enabled); + + PD_ConfigEnableTensorRtDla(config, 4); + bool dla_enabled = PD_ConfigTensorRtDlaEnabled(config); + EXPECT_TRUE(dla_enabled); + + PD_ConfigEnableGpuMultiStream(config); + bool thread_local_thread = PD_ConfigThreadLocalStreamEnabled(config); + EXPECT_TRUE(thread_local_thread); + + PD_ConfigDisableGpu(config); + PD_ConfigDestroy(config); +} + +TEST(PD_Config, use_gpu) { + std::string model_dir = FLAGS_infer_model + "/mobilenet"; + PD_Config* config = PD_ConfigCreate(); + + PD_ConfigDisableGpu(config); + PD_ConfigSetCpuMathLibraryNumThreads(config, 10); + int num_thread = PD_ConfigGetCpuMathLibraryNumThreads(config); + EXPECT_EQ(num_thread, 10); + + PD_ConfigSwitchIrDebug(config, TRUE); + PD_ConfigSetModelDir(config, model_dir.c_str()); + PD_ConfigSetOptimCacheDir(config, + (FLAGS_infer_model + "/OptimCacheDir").c_str()); + const char* model_dir_ = PD_ConfigGetModelDir(config); + LOG(INFO) << model_dir_; + + PD_ConfigEnableUseGpu(config, 100, 0); + bool use_gpu = PD_ConfigUseGpu(config); + EXPECT_TRUE(use_gpu); + int device_id = PD_ConfigGpuDeviceId(config); + EXPECT_EQ(device_id, 0); + int init_size = PD_ConfigMemoryPoolInitSizeMb(config); + EXPECT_EQ(init_size, 100); + + float frac = PD_ConfigFractionOfGpuMemoryForPool(config); + LOG(INFO) << frac; + + PD_ConfigEnableCudnn(config); + bool cudnn = PD_ConfigCudnnEnabled(config); + EXPECT_TRUE(cudnn); + + PD_ConfigSwitchIrOptim(config, TRUE); + bool ir_optim = PD_ConfigIrOptim(config); + 
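+  // Like the checks above, the remaining assertions only round-trip the
+  // PD_Config setters and getters; no predictor is built in this test, so
+  // nothing is actually executed on the GPU here.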
EXPECT_TRUE(ir_optim); + + PD_ConfigEnableTensorRtEngine(config, 1 << 20, 1, 3, PD_PRECISION_FLOAT32, + FALSE, FALSE); + bool trt_enable = PD_ConfigTensorRtEngineEnabled(config); + EXPECT_TRUE(trt_enable); + PD_ConfigEnableMemoryOptim(config); + bool memory_optim_enable = PD_ConfigMemoryOptimEnabled(config); + EXPECT_TRUE(memory_optim_enable); + PD_ConfigEnableProfile(config); + bool profiler_enable = PD_ConfigProfileEnabled(config); + EXPECT_TRUE(profiler_enable); + PD_ConfigSetInvalid(config); + bool is_valid = PD_ConfigIsValid(config); + EXPECT_FALSE(is_valid); + PD_ConfigDestroy(config); +} + +TEST(PD_Config, trt_int8) { + std::string model_dir = FLAGS_infer_model + "/mobilenet"; + PD_Config* config = PD_ConfigCreate(); + PD_ConfigEnableUseGpu(config, 100, 0); + PD_ConfigEnableTensorRtEngine(config, 1 << 20, 1, 3, PD_PRECISION_INT8, FALSE, + TRUE); + bool trt_enable = PD_ConfigTensorRtEngineEnabled(config); + EXPECT_TRUE(trt_enable); + PD_ConfigDestroy(config); +} + +TEST(PD_Config, trt_fp16) { + std::string model_dir = FLAGS_infer_model + "/mobilenet"; + PD_Config* config = PD_ConfigCreate(); + PD_ConfigEnableUseGpu(config, 100, 0); + PD_ConfigEnableTensorRtEngine(config, 1 << 20, 1, 3, PD_PRECISION_HALF, FALSE, + FALSE); + bool trt_enable = PD_ConfigTensorRtEngineEnabled(config); + EXPECT_TRUE(trt_enable); + PD_Predictor* predictor = PD_PredictorCreate(config); + PD_PredictorDestroy(predictor); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_int_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_int_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..d3a15cb285772d5cc75a460c388223c8da663119 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_int_tester.cc @@ -0,0 +1,89 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include +#include +#include +#include "paddle/fluid/inference/capi_exp/pd_inference_api.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +void predictor_run() { + std::string model_dir = FLAGS_infer_model; + PD_Config* config = PD_ConfigCreate(); + PD_ConfigDisableGpu(config); + PD_ConfigSetCpuMathLibraryNumThreads(config, 10); + PD_ConfigSwitchIrDebug(config, TRUE); + PD_ConfigSetModelDir(config, model_dir.c_str()); + PD_Predictor* predictor = PD_PredictorCreate(config); + PD_OneDimArrayCstr* input_names = PD_PredictorGetInputNames(predictor); + LOG(INFO) << "The inputs' size is: " << input_names->size; + EXPECT_EQ(input_names->size, 2u); + + int32_t shape_0[4] = {1, 3, 224, 224}; + float data_0[1 * 3 * 224 * 224] = {0}; + PD_Tensor* input_0 = PD_PredictorGetInputHandle(predictor, "image"); + PD_TensorReshape(input_0, 4, shape_0); + PD_TensorCopyFromCpuFloat(input_0, data_0); + int32_t shape_1[2] = {1, 1}; + int64_t data_1[1] = {0}; + PD_Tensor* input_1 = PD_PredictorGetInputHandle(predictor, "label"); + PD_TensorReshape(input_1, 2, shape_1); + PD_TensorCopyFromCpuInt64(input_1, data_1); + + LOG(INFO) << "Run Inference in CAPI encapsulation. "; + EXPECT_TRUE(PD_PredictorRun(predictor)); + + PD_OneDimArrayCstr* output_names = PD_PredictorGetOutputNames(predictor); + LOG(INFO) << "output size is: " << output_names->size; + for (size_t index = 0; index < output_names->size; ++index) { + LOG(INFO) << "output[" << index + << "]'s name is: " << output_names->data[index]; + PD_Tensor* output = + PD_PredictorGetOutputHandle(predictor, output_names->data[index]); + PD_OneDimArrayInt32* shape = PD_TensorGetShape(output); + LOG(INFO) << "output[" << index << "]'s shape_size is: " << shape->size; + int32_t out_size = 1; + for (size_t i = 0; i < shape->size; ++i) { + LOG(INFO) << "output[" << index << "]'s shape is: " << shape->data[i]; + out_size = out_size * shape->data[i]; + } + float* out_data = new float[out_size]; + PD_TensorCopyToCpuFloat(output, out_data); + LOG(INFO) << "output[" << index << "]'s DATA is: " << out_data[0]; + delete[] out_data; + PD_OneDimArrayInt32Destroy(shape); + PD_TensorDestroy(output); + } + PD_PredictorClearIntermediateTensor(predictor); + PD_PredictorTryShrinkMemory(predictor); + PD_OneDimArrayCstrDestroy(output_names); + PD_TensorDestroy(input_1); + PD_TensorDestroy(input_0); + PD_OneDimArrayCstrDestroy(input_names); + PD_PredictorDestroy(predictor); +} + +#ifdef PADDLE_WITH_MKLDNN +TEST(PD_PredictorRun, predictor_run) { predictor_run(); } +#endif + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_ner_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_ner_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..4369cd78dfa3746677d0916ad3a5c106da412ff0 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_ner_tester.cc @@ -0,0 +1,105 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#include "paddle/fluid/inference/capi_exp/pd_inference_api.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +TEST(PD_PredictorRun, predictor_run) { + auto model_dir = FLAGS_infer_model; + PD_Config *config = PD_ConfigCreate(); + PD_ConfigSetModel(config, (model_dir + "/__model__").c_str(), + (model_dir + "/param").c_str()); + PD_ConfigDisableGpu(config); + + PD_Predictor *predictor = PD_PredictorCreate(config); + size_t input_num = PD_PredictorGetInputNum(predictor); + LOG(INFO) << "Input num: " << input_num; + size_t output_num = PD_PredictorGetOutputNum(predictor); + LOG(INFO) << "Output num: " << output_num; + + PD_OneDimArrayCstr *input_names = PD_PredictorGetInputNames(predictor); + EXPECT_EQ(input_names->size, 2u); + LOG(INFO) << "Predictor start run!"; + PD_Tensor *inputs[2]; + inputs[0] = PD_PredictorGetInputHandle(predictor, input_names->data[0]); + inputs[1] = PD_PredictorGetInputHandle(predictor, input_names->data[1]); + LOG(INFO) << "Predictor start run!"; + // inputs[0]: word, use lod memory in stack + int32_t shape_0[2] = {11, 1}; + int64_t data_0[11 * 1] = {12673, 9763, 905, 284, 45, 7474, 20, 17, 1, 4, 9}; + size_t lod_layer_0[2] = {0, 11}; + PD_OneDimArraySize layer_0; + layer_0.size = 2; + layer_0.data = lod_layer_0; + PD_OneDimArraySize *layer_0_ptr = &layer_0; + PD_TwoDimArraySize lod_0; + lod_0.size = 1; + lod_0.data = &layer_0_ptr; + PD_TensorReshape(inputs[0], 2, shape_0); + PD_TensorCopyFromCpuInt64(inputs[0], data_0); + PD_TensorSetLod(inputs[0], &lod_0); + + // inputs[1]: mention, use lod memory in heap + int32_t shape_1[2] = {11, 1}; + int64_t data_1[11 * 1] = {27, 0, 0, 33, 34, 33, 0, 0, 0, 1, 2}; + PD_TwoDimArraySize *lod_1_ptr = new PD_TwoDimArraySize(); + lod_1_ptr->size = 1; + lod_1_ptr->data = new PD_OneDimArraySize *[1]; + lod_1_ptr->data[0] = new PD_OneDimArraySize(); + lod_1_ptr->data[0]->size = 2; + lod_1_ptr->data[0]->data = new size_t[2]; + lod_1_ptr->data[0]->data[0] = 0; + lod_1_ptr->data[0]->data[1] = 11; + + PD_TensorReshape(inputs[1], 2, shape_1); + PD_TensorCopyFromCpuInt64(inputs[1], data_1); + PD_TensorSetLod(inputs[1], lod_1_ptr); + // retrieve the lod memory + delete[] lod_1_ptr->data[0]->data; + delete lod_1_ptr->data[0]; + delete[] lod_1_ptr->data; + delete lod_1_ptr; + lod_1_ptr = nullptr; + + LOG(INFO) << "Predictor start run!"; + bool success = PD_PredictorRun(predictor); + EXPECT_TRUE(success); + LOG(INFO) << "Predictor run success!"; + PD_OneDimArrayCstr *output_names = PD_PredictorGetOutputNames(predictor); + PD_Tensor *output = + PD_PredictorGetOutputHandle(predictor, output_names->data[0]); + PD_TwoDimArraySize *output_lod = PD_TensorGetLod(output); + + PD_TwoDimArraySizeDestroy(output_lod); + PD_TensorDestroy(output); + PD_OneDimArrayCstrDestroy(output_names); + + PD_TensorDestroy(inputs[0]); + PD_TensorDestroy(inputs[1]); + PD_OneDimArrayCstrDestroy(input_names); + PD_PredictorDestroy(predictor); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff 
--git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_config_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_config_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..18107704ae420f5eacbbac885f77dc7ed042b73f --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_config_tester.cc @@ -0,0 +1,108 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include +#include +#include "paddle/fluid/inference/capi_exp/pd_inference_api.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +TEST(PD_Config, interface) { + std::string model_dir = FLAGS_infer_model + "/mobilenet"; + std::string prog_file = model_dir + "/__model__"; + std::string param_file = model_dir + "/__params__"; + std::string opt_cache_dir = FLAGS_infer_model + "/OptimCacheDir"; + + PD_Config* config = PD_ConfigCreate(); + PD_ConfigSetModelDir(config, model_dir.c_str()); + std::string model_dir_ = PD_ConfigGetModelDir(config); + EXPECT_EQ(model_dir, model_dir_); + + PD_ConfigSetModel(config, prog_file.c_str(), param_file.c_str()); + PD_ConfigSetProgFile(config, prog_file.c_str()); + PD_ConfigSetParamsFile(config, param_file.c_str()); + PD_ConfigSetOptimCacheDir(config, opt_cache_dir.c_str()); + std::string prog_file_ = PD_ConfigGetProgFile(config); + std::string param_file_ = PD_ConfigGetParamsFile(config); + EXPECT_EQ(prog_file, prog_file_); + EXPECT_EQ(param_file, param_file_); + + PD_ConfigDisableFCPadding(config); + bool fc_padding = PD_ConfigUseFcPadding(config); + EXPECT_FALSE(fc_padding); + + PD_ConfigDisableGpu(config); + PD_ConfigSwitchIrOptim(config, TRUE); + bool ir_optim = PD_ConfigIrOptim(config); + EXPECT_TRUE(ir_optim); + +#ifndef PADDLE_WITH_LITE + PD_ConfigEnableLiteEngine(config, PD_PRECISION_FLOAT32, TRUE, 0, nullptr, 0, + nullptr); + bool lite_enabled = PD_ConfigLiteEngineEnabled(config); + EXPECT_TRUE(lite_enabled); +#endif + + PD_ConfigSwitchIrDebug(config, TRUE); +#ifdef PADDLE_WITH_MKLDNN + const char* ops_name = "conv_2d"; + PD_ConfigEnableMKLDNN(config); + PD_ConfigSetMkldnnOp(config, 1, &ops_name); + PD_ConfigSetMkldnnCacheCapacity(config, 100); + bool mkldnn_enabled = PD_ConfigMkldnnEnabled(config); + EXPECT_TRUE(mkldnn_enabled); + + PD_ConfigSetCpuMathLibraryNumThreads(config, 10); + int32_t cpu_threads = PD_ConfigGetCpuMathLibraryNumThreads(config); + EXPECT_EQ(cpu_threads, 10); + + PD_ConfigEnableMkldnnQuantizer(config); + bool mkldnn_qt_enabled = PD_ConfigMkldnnQuantizerEnabled(config); + EXPECT_TRUE(mkldnn_qt_enabled); + + PD_ConfigEnableMkldnnBfloat16(config); + PD_ConfigSetBfloat16Op(config, 1, &ops_name); + bool mkldnn_bf16_enabled = PD_ConfigMkldnnBfloat16Enabled(config); + EXPECT_TRUE(mkldnn_bf16_enabled); +#endif + + PD_ConfigEnableMemoryOptim(config); + bool memory_enabled = PD_ConfigMemoryOptimEnabled(config); + EXPECT_TRUE(memory_enabled); + + 
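+  // The remaining calls cover the profiling, glog and validity switches:
+  // PD_ConfigSetInvalid should make PD_ConfigIsValid return false, and
+  // PD_ConfigPartiallyRelease / PD_ConfigDestroy release the config at the
+  // end of the test.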
PD_ConfigEnableProfile(config); + bool profile_enabled = PD_ConfigProfileEnabled(config); + EXPECT_TRUE(profile_enabled); + + PD_ConfigDisableGlogInfo(config); + bool glog_diabled = PD_ConfigGlogInfoDisabled(config); + EXPECT_TRUE(glog_diabled); + + PD_ConfigSetInvalid(config); + bool is_valid = PD_ConfigIsValid(config); + EXPECT_FALSE(is_valid); + + PD_ConfigPartiallyRelease(config); + PD_ConfigDestroy(config); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_tensor_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_tensor_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..f4017fc5a7f3408b99bb664d52192f7cb35f9144 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_tensor_tester.cc @@ -0,0 +1,196 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/inference/capi_exp/pd_inference_api.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +void PD_run() { + auto model_dir = FLAGS_infer_model; + PD_Config* config = PD_ConfigCreate(); + PD_ConfigSetModel(config, (model_dir + "/__model__").c_str(), + (model_dir + "/__params__").c_str()); + PD_Predictor* predictor = PD_PredictorCreate(config); + PD_OneDimArrayCstr* input_names = PD_PredictorGetInputNames(predictor); + PD_Tensor* tensor = + PD_PredictorGetInputHandle(predictor, input_names->data[0]); + + int32_t shapes[4] = {1, 3, 300, 300}; + std::vector input(1 * 3 * 300 * 300, 0); + int32_t size; + PD_PlaceType place; + PD_TensorReshape(tensor, 4, shapes); + PD_TensorCopyFromCpuFloat(tensor, input.data()); + PD_TensorDataFloat(tensor, &place, &size); + PD_TensorMutableDataFloat(tensor, place); + + PD_TwoDimArraySize lod; + lod.size = 0; + lod.data = NULL; + PD_TensorSetLod(tensor, &lod); + + PD_PredictorRun(predictor); + + std::vector out_data; + PD_OneDimArrayCstr* output_names = PD_PredictorGetOutputNames(predictor); + PD_Tensor* output_tensor = + PD_PredictorGetOutputHandle(predictor, output_names->data[0]); + PD_OneDimArrayInt32* output_shape = PD_TensorGetShape(output_tensor); + int32_t out_num = std::accumulate(output_shape->data, + output_shape->data + output_shape->size, 1, + std::multiplies()); + out_data.resize(out_num); + PD_TensorCopyToCpuFloat(output_tensor, out_data.data()); + LOG(INFO) << "Output tensor name is: " << PD_TensorGetName(output_tensor); + PD_DataType data_type = PD_TensorGetDataType(output_tensor); + EXPECT_EQ(data_type, PD_DATA_FLOAT32); + + PD_TwoDimArraySize* out_lod = PD_TensorGetLod(output_tensor); + + PD_TwoDimArraySizeDestroy(out_lod); + PD_OneDimArrayInt32Destroy(output_shape); + PD_TensorDestroy(output_tensor); + PD_OneDimArrayCstrDestroy(output_names); + PD_TensorDestroy(tensor); + PD_OneDimArrayCstrDestroy(input_names); + 
PD_PredictorDestroy(predictor); +} +TEST(PD_Tensor, PD_run) { PD_run(); } + +TEST(PD_Tensor, int32) { + auto model_dir = FLAGS_infer_model; + PD_Config* config = PD_ConfigCreate(); + PD_ConfigSetModel(config, (model_dir + "/__model__").c_str(), + (model_dir + "/__params__").c_str()); + PD_Predictor* predictor = PD_PredictorCreate(config); + PD_OneDimArrayCstr* input_names = PD_PredictorGetInputNames(predictor); + PD_Tensor* tensor = + PD_PredictorGetInputHandle(predictor, input_names->data[0]); + int32_t shapes[4] = {1, 3, 300, 300}; + std::vector input(1 * 3 * 300 * 300, 0); + int32_t size; + PD_PlaceType place; + PD_TensorReshape(tensor, 4, shapes); + PD_TensorCopyFromCpuInt32(tensor, input.data()); + int32_t* data_ptr = PD_TensorDataInt32(tensor, &place, &size); + EXPECT_EQ(place, PD_PLACE_CPU); + EXPECT_EQ(size, 1 * 3 * 300 * 300); + int32_t* mutable_data_ptr = PD_TensorMutableDataInt32(tensor, place); + EXPECT_EQ(data_ptr, mutable_data_ptr); + + PD_DataType data_type = PD_TensorGetDataType(tensor); + EXPECT_EQ(data_type, PD_DATA_INT32); + PD_TensorCopyToCpuInt32(tensor, input.data()); + + PD_TensorDestroy(tensor); + PD_OneDimArrayCstrDestroy(input_names); + PD_PredictorDestroy(predictor); +} + +TEST(PD_Tensor, int64) { + auto model_dir = FLAGS_infer_model; + PD_Config* config = PD_ConfigCreate(); + PD_ConfigSetModel(config, (model_dir + "/__model__").c_str(), + (model_dir + "/__params__").c_str()); + PD_Predictor* predictor = PD_PredictorCreate(config); + PD_OneDimArrayCstr* input_names = PD_PredictorGetInputNames(predictor); + PD_Tensor* tensor = + PD_PredictorGetInputHandle(predictor, input_names->data[0]); + int32_t shapes[4] = {1, 3, 300, 300}; + std::vector input(1 * 3 * 300 * 300, 0); + int32_t size; + PD_PlaceType place; + PD_TensorReshape(tensor, 4, shapes); + PD_TensorCopyFromCpuInt64(tensor, input.data()); + int64_t* data_ptr = PD_TensorDataInt64(tensor, &place, &size); + EXPECT_EQ(place, PD_PLACE_CPU); + EXPECT_EQ(size, 1 * 3 * 300 * 300); + int64_t* mutable_data_ptr = PD_TensorMutableDataInt64(tensor, place); + EXPECT_EQ(data_ptr, mutable_data_ptr); + + PD_DataType data_type = PD_TensorGetDataType(tensor); + EXPECT_EQ(data_type, PD_DATA_INT64); + PD_TensorCopyToCpuInt64(tensor, input.data()); + + PD_TensorDestroy(tensor); + PD_OneDimArrayCstrDestroy(input_names); + PD_PredictorDestroy(predictor); +} + +TEST(PD_Tensor, uint8) { + auto model_dir = FLAGS_infer_model; + PD_Config* config = PD_ConfigCreate(); + PD_ConfigSetModel(config, (model_dir + "/__model__").c_str(), + (model_dir + "/__params__").c_str()); + PD_Predictor* predictor = PD_PredictorCreate(config); + PD_OneDimArrayCstr* input_names = PD_PredictorGetInputNames(predictor); + PD_Tensor* tensor = + PD_PredictorGetInputHandle(predictor, input_names->data[0]); + int32_t shapes[4] = {1, 3, 300, 300}; + uint8_t input[1 * 3 * 300 * 300] = {0}; + int32_t size; + PD_PlaceType place; + PD_TensorReshape(tensor, 4, shapes); + PD_TensorCopyFromCpuUint8(tensor, input); + uint8_t* data_ptr = PD_TensorDataUint8(tensor, &place, &size); + EXPECT_EQ(place, PD_PLACE_CPU); + EXPECT_EQ(size, 1 * 3 * 300 * 300); + uint8_t* mutable_data_ptr = PD_TensorMutableDataUint8(tensor, place); + EXPECT_EQ(data_ptr, mutable_data_ptr); + + PD_DataType data_type = PD_TensorGetDataType(tensor); + EXPECT_EQ(data_type, PD_DATA_UINT8); + PD_TensorCopyToCpuUint8(tensor, input); + + PD_TensorDestroy(tensor); + PD_OneDimArrayCstrDestroy(input_names); + PD_PredictorDestroy(predictor); +} + +std::string read_file(std::string filename) { + std::ifstream 
file(filename); + return std::string((std::istreambuf_iterator(file)), + std::istreambuf_iterator()); +} + +TEST(PD_Tensor, from_buffer) { + PD_Config* config = PD_ConfigCreate(); + std::string prog_file = FLAGS_infer_model + "/__model__"; + std::string params_file = FLAGS_infer_model + "/__params__"; + + std::string prog_str = read_file(prog_file); + std::string params_str = read_file(params_file); + + PD_ConfigSetModelBuffer(config, prog_str.c_str(), prog_str.size(), + params_str.c_str(), params_str.size()); + + bool model_from_memory = PD_ConfigModelFromMemory(config); + EXPECT_TRUE(model_from_memory); + PD_ConfigDestroy(config); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_threads_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_threads_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..8951c446b1f83a952abaf55767a3cea52d8f0463 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_threads_tester.cc @@ -0,0 +1,119 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/inference/capi_exp/pd_inference_api.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +typedef struct RunParameter { + PD_Predictor* predictor; + int32_t* shapes; + size_t shape_size; + float* input_data; + int32_t out_size; + float* out_data; + int32_t thread_index; +} RunParameter; + +void* run(void* thread_param) { + struct RunParameter* param = (struct RunParameter*)thread_param; + LOG(INFO) << "Thread " << param->thread_index << " start run!"; + PD_OneDimArrayCstr* input_names = PD_PredictorGetInputNames(param->predictor); + PD_Tensor* tensor = + PD_PredictorGetInputHandle(param->predictor, input_names->data[0]); + PD_TensorReshape(tensor, param->shape_size, param->shapes); + PD_TensorCopyFromCpuFloat(tensor, param->input_data); + PD_PredictorRun(param->predictor); + PD_OneDimArrayCstr* output_names = + PD_PredictorGetOutputNames(param->predictor); + PD_Tensor* output_tensor = + PD_PredictorGetOutputHandle(param->predictor, output_names->data[0]); + PD_OneDimArrayInt32* output_shape = PD_TensorGetShape(output_tensor); + param->out_size = 1; + for (size_t index = 0; index < output_shape->size; ++index) { + param->out_size = param->out_size * output_shape->data[index]; + } + PD_OneDimArrayInt32Destroy(output_shape); + param->out_data = + reinterpret_cast(malloc(param->out_size * sizeof(float))); + PD_TensorCopyToCpuFloat(output_tensor, param->out_data); + PD_TensorDestroy(output_tensor); + PD_OneDimArrayCstrDestroy(output_names); + PD_TensorDestroy(tensor); + PD_OneDimArrayCstrDestroy(input_names); + LOG(INFO) << "Thread " << param->thread_index << " end run!"; + return NULL; +} +void threads_run(int thread_num) { + auto model_dir = 
FLAGS_infer_model; + PD_Config* config = PD_ConfigCreate(); + PD_ConfigSetModel(config, (model_dir + "/__model__").c_str(), + (model_dir + "/__params__").c_str()); + PD_Predictor* predictor = PD_PredictorCreate(config); + + pthread_t* threads = + reinterpret_cast(malloc(thread_num * sizeof(pthread_t))); + RunParameter* params = reinterpret_cast( + malloc(thread_num * sizeof(RunParameter))); + int32_t shapes[4] = {1, 3, 300, 300}; + float* input = + reinterpret_cast(malloc(1 * 3 * 300 * 300 * sizeof(float))); + memset(input, 0, 1 * 3 * 300 * 300 * sizeof(float)); + for (int i = 0; i < thread_num; ++i) { + params[i].predictor = PD_PredictorClone(predictor); + params[i].shapes = shapes; + params[i].shape_size = 4; + params[i].input_data = input; + params[i].out_size = 0; + params[i].out_data = NULL; + params[i].thread_index = i; + pthread_create(&(threads[i]), NULL, run, (params + i)); + } + for (int i = 0; i < thread_num; ++i) { + pthread_join(threads[i], NULL); + } + ASSERT_GT(params[0].out_size, 0); + + for (int i = 1; i < thread_num; ++i) { + ASSERT_EQ(params[i].out_size, params[0].out_size); + for (int j = 0; j < params[i].out_size; ++j) { + ASSERT_EQ(params[i].out_data[j], params[0].out_data[j]); + } + } + for (int i = 0; i < thread_num; ++i) { + PD_PredictorDestroy(params[i].predictor); + free(params[i].out_data); + } + free(input); + free(params); + free(threads); + PD_PredictorDestroy(predictor); +} + +TEST(PD_Predictor, PD_multi_threads_run) { threads_run(10); } + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..11de1a5a6fab4f38bb57ff2af451d55658d0cbca --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_tester.cc @@ -0,0 +1,83 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include +#include +#include + +#include "paddle/fluid/inference/capi_exp/pd_inference_api.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +void predictor_run() { + std::string model_dir = FLAGS_infer_model; + std::string prog_file = model_dir + "/model"; + std::string params_file = model_dir + "/params"; + PD_Config *config = PD_ConfigCreate(); + PD_ConfigDisableGpu(config); + PD_ConfigSetCpuMathLibraryNumThreads(config, 10); + PD_ConfigSwitchIrDebug(config, TRUE); + PD_ConfigSetModel(config, prog_file.c_str(), params_file.c_str()); + + PD_Predictor *predictor = PD_PredictorCreate(config); + PD_Tensor *tensor = PD_PredictorGetInputHandle(predictor, "data"); + + const int batch_size = 1; + const int channels = 3; + const int height = 318; + const int width = 318; + float *input = new float[batch_size * channels * height * width](); + + int32_t shape[4] = {batch_size, channels, height, width}; + PD_TensorReshape(tensor, 4, shape); + PD_TensorCopyFromCpuFloat(tensor, input); + EXPECT_TRUE(PD_PredictorRun(predictor)); + + delete[] input; + PD_TensorDestroy(tensor); + PD_PredictorDestroy(predictor); +} + +TEST(PD_PredictorRun, predictor_run) { predictor_run(); } + +#ifdef PADDLE_WITH_MKLDNN +TEST(PD_Config, profile_mkldnn) { + std::string model_dir = FLAGS_infer_model; + std::string prog_file = model_dir + "/model"; + std::string params_file = model_dir + "/params"; + PD_Config *config = PD_ConfigCreate(); + PD_ConfigDisableGpu(config); + PD_ConfigSetCpuMathLibraryNumThreads(config, 10); + PD_ConfigSwitchIrDebug(config, TRUE); + PD_ConfigEnableMKLDNN(config); + bool mkldnn_enable = PD_ConfigMkldnnEnabled(config); + EXPECT_TRUE(mkldnn_enable); + PD_ConfigEnableMkldnnQuantizer(config); + bool quantizer_enable = PD_ConfigMkldnnQuantizerEnabled(config); + EXPECT_TRUE(quantizer_enable); + PD_ConfigEnableMkldnnBfloat16(config); + PD_ConfigSetMkldnnCacheCapacity(config, 0); + PD_ConfigSetModel(config, prog_file.c_str(), params_file.c_str()); + PD_ConfigDestroy(config); +} +#endif + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_xpu_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_xpu_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..f4fd04e85840de8fdf7afa2903fe52b023bab644 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_xpu_tester.cc @@ -0,0 +1,60 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#include +#include +#include +#include +#include +#include "paddle/fluid/inference/capi_exp/pd_inference_api.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +#ifdef PADDLE_WITH_XPU +TEST(PD_Config, use_xpu) { + std::string model_dir = FLAGS_infer_model + "/mobilenet"; + PD_Config *config = PD_ConfigCreate(); + PD_ConfigSwitchIrDebug(config, TRUE); + PD_ConfigSetModelDir(config, model_dir.c_str()); + PD_ConfigSetOptimCacheDir(config, + (FLAGS_infer_model + "/OptimCacheDir").c_str()); + const char *model_dir_ = PD_ConfigGetModelDir(config); + LOG(INFO) << model_dir_; + PD_ConfigEnableXpu(config, 0xfffc00); + bool use_xpu = PD_ConfigUseXpu(config); + EXPECT_TRUE(use_xpu); + int32_t device_id = PD_ConfigXpuDeviceId(config); + EXPECT_EQ(device_id, 0); + PD_ConfigSwitchIrOptim(config, TRUE); + bool ir_optim = PD_ConfigIrOptim(config); + EXPECT_TRUE(ir_optim); + PD_ConfigEnableMemoryOptim(config); + bool memory_optim_enable = PD_ConfigMemoryOptimEnabled(config); + EXPECT_TRUE(memory_optim_enable); + PD_ConfigEnableProfile(config); + bool profiler_enable = PD_ConfigProfileEnabled(config); + EXPECT_TRUE(profiler_enable); + PD_ConfigSetInvalid(config); + bool is_valid = PD_ConfigIsValid(config); + EXPECT_FALSE(is_valid); + PD_ConfigDestroy(config); +} +#endif + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/lite_mul_model_test.cc b/paddle/fluid/inference/tests/api/lite_mul_model_test.cc index 6d4bb70df6f3ad5a214543dacbc5d0d1864a67b1..9211ea246a5c5e0cdc75e6fef72ae0e4e40d69af 100644 --- a/paddle/fluid/inference/tests/api/lite_mul_model_test.cc +++ b/paddle/fluid/inference/tests/api/lite_mul_model_test.cc @@ -75,14 +75,15 @@ int test_predictor_zero_copy(const AnalysisConfig& config_in, } std::vector<float> input({1}); - auto in_tensor{predictor->GetInputTensor(predictor->GetInputNames().front())}; + auto in_tensor = + predictor->GetInputTensor(predictor->GetInputNames().front()); in_tensor->Reshape({1, 1}); in_tensor->copy_from_cpu(input.data()); predictor->ZeroCopyRun(); - auto out_tensor{ - predictor->GetOutputTensor(predictor->GetOutputNames().front())}; + auto out_tensor = + predictor->GetOutputTensor(predictor->GetOutputNames().front()); std::vector<float> data_o(10); out_tensor->copy_to_cpu(data_o.data()); diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt index 377ea37677389923cabc71dfaf62fd2b11ab4f7c..2ea047fa13c10596995916234ef67e8a276b6b22 100644 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -27,12 +27,18 @@ if (WITH_ROCM) cc_test(thread_local_allocator_test SRCS thread_local_allocator_test.cc DEPS thread_local_allocator) endif() +if (WITH_ASCEND_CL) + cc_library(npu_allocator SRCS npu_allocator.cc DEPS allocator npu_info) +endif() + cc_library(retry_allocator SRCS retry_allocator.cc DEPS allocator) if (WITH_GPU OR WITH_ROCM) set(AllocatorFacadeDeps gpu_info cuda_allocator pinned_allocator cuda_device_guard thread_local_allocator) elseif(WITH_XPU) set(AllocatorFacadeDeps xpu_info) +elseif(WITH_ASCEND) + set(AllocatorFacadeDeps ascend_npu_info) else () set(AllocatorFacadeDeps) endif() diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index cbeb263b5f41b96c73d67d9f56a407eecf209815..730efa5c646885026eee1e472205ce723b0fcb1b 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc 
+++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -32,6 +32,7 @@ #ifdef PADDLE_WITH_XPU #include "paddle/fluid/platform/xpu_info.h" #endif +#include "paddle/fluid/platform/npu_info.h" DEFINE_int64( gpu_allocator_retry_time, 10000, @@ -66,6 +67,11 @@ class AllocatorFacadePrivate { InitNaiveBestFitCUDAAllocator(platform::CUDAPlace(dev_id)); } InitNaiveBestFitCUDAPinnedAllocator(); +#endif +#ifdef PADDLE_WITH_ASCEND_CL + for (int dev_id = 0; dev_id < platform::GetNPUDeviceCount(); ++dev_id) { + InitNaiveBestFitNPUAllocator(platform::NPUPlace(dev_id)); + } #endif break; } @@ -185,6 +191,12 @@ class AllocatorFacadePrivate { } #endif +#ifdef PADDLE_WITH_ASCEND_CL + void InitNaiveBestFitNPUAllocator(platform::NPUPlace p) { + allocators_[p] = std::make_shared(p); + } +#endif + class ZeroSizeAllocator : public Allocator { public: explicit ZeroSizeAllocator(platform::Place place) : place_(place) {} diff --git a/paddle/fluid/memory/allocation/cuda_allocator.cc b/paddle/fluid/memory/allocation/cuda_allocator.cc index c1b12f5c0ecbb6e4b367be0eb0ea9730b9f14ea6..b1a45afa99d9a565bfc3b8b3e6192eca7d2ccd05 100644 --- a/paddle/fluid/memory/allocation/cuda_allocator.cc +++ b/paddle/fluid/memory/allocation/cuda_allocator.cc @@ -54,6 +54,7 @@ Allocation* CUDAAllocator::AllocateImpl(size_t size) { size_t avail, total, actual_avail, actual_total; bool is_limited = platform::RecordedCudaMemGetInfo( &avail, &total, &actual_avail, &actual_total, place_.device); + size_t allocated = total - avail; std::string err_msg; if (is_limited) { @@ -68,13 +69,14 @@ Allocation* CUDAAllocator::AllocateImpl(size_t size) { PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted( "\n\nOut of memory error on GPU %d. " - "Cannot allocate %s memory on GPU %d, " + "Cannot allocate %s memory on GPU %d, %s memory has been allocated and " "available memory is only %s.\n\n" "Please check whether there is any other process using GPU %d.\n" "1. If yes, please stop them, or start PaddlePaddle on another GPU.\n" "2. If no, please decrease the batch size of your model. 
%s\n\n", place_.device, string::HumanReadableSize(size), place_.device, - string::HumanReadableSize(avail), place_.device, err_msg)); + string::HumanReadableSize(allocated), string::HumanReadableSize(avail), + place_.device, err_msg)); } } // namespace allocation diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc index 0ada2cafcc16a638cba2e8dbd8d36ce1b219d0b5..3e88d61783c9e67053ef065f61fef5cf991a9b25 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc @@ -19,7 +19,10 @@ #include "gflags/gflags.h" #include "glog/logging.h" #include "paddle/fluid/memory/detail/buddy_allocator.h" +#include "paddle/fluid/memory/detail/system_allocator.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/npu_info.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/string/printf.h" @@ -110,6 +113,7 @@ size_t Used(const platform::CPUPlace &place) { return GetCPUBuddyAllocator()->Used(); } +// For kunlun XPU template <> void *Alloc(const platform::XPUPlace &place, size_t size) { #ifdef PADDLE_WITH_XPU @@ -219,6 +223,135 @@ size_t Used(const platform::XPUPlace &place) { #endif } +// For Ascend NPU +#ifdef PADDLE_WITH_ASCEND_CL +class NPUBuddyAllocatorList { + private: + NPUBuddyAllocatorList() : devices_(platform::GetSelectedNPUDevices()) { + auto npu_num = devices_.size(); + allocators_.resize(npu_num); + init_flags_.reserve(npu_num); + for (size_t i = 0; i < npu_num; ++i) { + init_flags_.emplace_back(new std::once_flag()); + } + } + + static NPUBuddyAllocatorList *CreateNewInstance() { + return new NPUBuddyAllocatorList(); + } + + public: + static NPUBuddyAllocatorList *Instance() { + static auto *instance = CreateNewInstance(); + return instance; + } + + BuddyAllocator *Get(int npu_id) { + auto pos = std::distance( + devices_.begin(), std::find(devices_.begin(), devices_.end(), npu_id)); + PADDLE_ENFORCE_LT(pos, devices_.size(), + platform::errors::OutOfRange( + "The index exceeds the size of devices, the size of " + "devices is %d, the index is %d", + devices_.size(), pos)); + + std::call_once(*init_flags_[pos], [this, pos] { + platform::SetNPUDeviceId(devices_[pos]); + allocators_[pos].reset(new BuddyAllocator( + std::unique_ptr( + new detail::NPUAllocator(devices_[pos])), + platform::NPUMinChunkSize(), platform::NPUMaxChunkSize())); + VLOG(10) << "\n\nNOTE:\n" + << "You can set GFlags environment variable " + << "'FLAGS_fraction_of_gpu_memory_to_use' " + << "or 'FLAGS_initial_gpu_memory_in_mb' " + << "or 'FLAGS_reallocate_gpu_memory_in_mb' " + << "to change the memory size for GPU usage.\n" + << "Current 'FLAGS_fraction_of_gpu_memory_to_use' value is " + << FLAGS_fraction_of_gpu_memory_to_use + << ". Current 'FLAGS_initial_gpu_memory_in_mb' value is " + << FLAGS_initial_gpu_memory_in_mb + << ". 
Current 'FLAGS_reallocate_gpu_memory_in_mb' value is " + << FLAGS_reallocate_gpu_memory_in_mb << "\n\n"; + }); + + return allocators_[pos].get(); + } + + private: + std::vector devices_; + std::vector> init_flags_; + std::vector> allocators_; +}; + +BuddyAllocator *GetNPUBuddyAllocator(int npu_id) { + return NPUBuddyAllocatorList::Instance()->Get(npu_id); +} +#endif + +template <> +size_t Used(const platform::NPUPlace &place) { +#ifdef PADDLE_WITH_ASCEND_CL + return GetNPUBuddyAllocator(place.device)->Used(); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "'NPUPlace' is not supported in CPU only device.")); +#endif +} + +template <> +void *Alloc(const platform::NPUPlace &place, size_t size) { +#ifdef PADDLE_WITH_ASCEND_CL + auto *buddy_allocator = GetNPUBuddyAllocator(place.device); + auto *ptr = buddy_allocator->Alloc(size); + if (ptr == nullptr) { + platform::NPUDeviceGuard(place.device); + size_t avail, total; + platform::NPUMemoryUsage(&avail, &total); + PADDLE_THROW(platform::errors::ResourceExhausted( + "Cannot allocate %s in GPU %d, avaliable %s, total %s, GpuMinChunkSize " + "%s, GpuMaxChunkSize %s, GPU memory used: %s.", + string::HumanReadableSize(size), place.device, + string::HumanReadableSize(avail), string::HumanReadableSize(total), + string::HumanReadableSize(buddy_allocator->GetMinChunkSize()), + string::HumanReadableSize(buddy_allocator->GetMaxChunkSize()), + string::HumanReadableSize(Used(place)))); + } else { + if (FLAGS_init_allocated_mem) { + aclrtMemset(ptr, size, 0xEF, size); + } + } + VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place); + return ptr; +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "'NPUPlace' is not supported in CPU only device.")); +#endif +} + +template <> +void Free(const platform::NPUPlace &place, void *p, + size_t size) { +#ifdef PADDLE_WITH_ASCEND_CL + VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place); + GetNPUBuddyAllocator(place.device)->Free(p); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "'NPUPlace' is not supported in CPU only device.")); +#endif +} + +template <> +uint64_t Release(const platform::NPUPlace &place) { +#ifdef PADDLE_WITH_ASCEND_CL + return GetNPUBuddyAllocator(place.device)->Release(); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "'NPUPlace' is not supported in CPU only device.")); +#endif +} + +// For CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) class GPUBuddyAllocatorList { private: diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator_test.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator_test.cc index 37da748ee9c965ab829851c7045ad6a1a1e0e93e..1fe85dd699acf18387482d296c2c30f3bb2415cb 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator_test.cc +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator_test.cc @@ -61,6 +61,22 @@ TEST(NaiveBestFitAllocatorTest, CudaPinnedAlloc) { } #endif +#ifdef PADDLE_WITH_ASCEND_CL +TEST(NaiveBestFitAllocatorTest, NpuAlloc) { + NaiveBestFitAllocator alloc{platform::NPUPlace(0)}; + { + size_t size = (1 << 20); + auto allocation = alloc.Allocate(size); + } + sleep(10); + alloc.Release(platform::NPUPlace(0)); + + size_t size = (1 << 20); + auto allocation = alloc.Allocate(size); + alloc.Release(platform::NPUPlace(0)); +} +#endif + } // namespace allocation } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/allocation/npu_allocator.cc b/paddle/fluid/memory/allocation/npu_allocator.cc new file mode 100644 
index 0000000000000000000000000000000000000000..faf7ae6221caaffeb3266b67c409b4bf61f476f0 --- /dev/null +++ b/paddle/fluid/memory/allocation/npu_allocator.cc @@ -0,0 +1,73 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/memory/allocation/npu_allocator.h" +#include +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/npu_info.h" + +namespace paddle { +namespace memory { +namespace allocation { + +bool NPUAllocator::IsAllocThreadSafe() const { return true; } +void NPUAllocator::FreeImpl(Allocation* allocation) { + PADDLE_ENFORCE_EQ( + BOOST_GET_CONST(platform::NPUPlace, allocation->place()), place_, + platform::errors::PermissionDenied( + "NPU memory is freed in incorrect device. This may be a bug")); + platform::RecordedNPUFree(allocation->ptr(), allocation->size(), + place_.device); + delete allocation; +} + +Allocation* NPUAllocator::AllocateImpl(size_t size) { + std::call_once(once_flag_, + [this] { platform::SetNPUDeviceId(place_.device); }); + + void* ptr; + auto result = platform::RecordedNPUMalloc(&ptr, size, place_.device); + if (LIKELY(result == ACL_ERROR_NONE)) { + return new Allocation(ptr, size, platform::Place(place_)); + } + + size_t avail, total, actual_avail, actual_total; + bool is_limited = platform::RecordedNPUMemGetInfo( + &avail, &total, &actual_avail, &actual_total, place_.device); + + std::string err_msg; + if (is_limited) { + auto limit_size = (total >> 20); + err_msg = string::Sprintf( + "Or set environment variable `FLAGS_gpu_memory_limit_mb` to a larger " + "value. Currently `FLAGS_gpu_memory_limit_mb` is %d, so the maximum " + "GPU memory usage is limited to %d MB.\n" + " The command is `export FLAGS_gpu_memory_limit_mb=xxx`.", + limit_size, limit_size); + } + + PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted( + "\n\nOut of memory error on NPU %d. " + "Cannot allocate %s memory on NPU %d, " + "available memory is only %s.\n\n" + "Please check whether there is any other process using NPU %d.\n" + "1. If yes, please stop them, or start PaddlePaddle on another NPU.\n" + "2. If no, please decrease the batch size of your model. %s\n\n", + place_.device, string::HumanReadableSize(size), place_.device, + string::HumanReadableSize(avail), place_.device, err_msg)); +} + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/npu_allocator.h b/paddle/fluid/memory/allocation/npu_allocator.h new file mode 100644 index 0000000000000000000000000000000000000000..bf668973505bab0b00b2da6111709e27236ffea6 --- /dev/null +++ b/paddle/fluid/memory/allocation/npu_allocator.h @@ -0,0 +1,41 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include // NOLINT +#include "paddle/fluid/memory/allocation/allocator.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace memory { +namespace allocation { + +class NPUAllocator : public Allocator { + public: + explicit NPUAllocator(const platform::NPUPlace& place) : place_(place) {} + + bool IsAllocThreadSafe() const override; + + protected: + void FreeImpl(Allocation* allocation) override; + Allocation* AllocateImpl(size_t size) override; + + private: + platform::NPUPlace place_; + std::once_flag once_flag_; +}; + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/detail/CMakeLists.txt b/paddle/fluid/memory/detail/CMakeLists.txt index fcae741db3667f4acf9ff33323f3f95710724669..e9631ee739b9b8089a963a6aa84a9837010ad639 100644 --- a/paddle/fluid/memory/detail/CMakeLists.txt +++ b/paddle/fluid/memory/detail/CMakeLists.txt @@ -6,6 +6,8 @@ if(WITH_GPU) nv_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info gpu_info place) elseif(WITH_ROCM) hip_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info gpu_info place) +elseif(${WITH_ASCEND_CL}) + cc_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info npu_info place) else() cc_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info place) endif() diff --git a/paddle/fluid/memory/detail/buddy_allocator.cc b/paddle/fluid/memory/detail/buddy_allocator.cc index 50c0b58f3a1dd6eafd4ca86f2378cbd8f4b2e041..55436f451a41ff2a77acddfaff3c5a7c290b7ac2 100644 --- a/paddle/fluid/memory/detail/buddy_allocator.cc +++ b/paddle/fluid/memory/detail/buddy_allocator.cc @@ -21,6 +21,9 @@ limitations under the License. */ #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) DECLARE_uint64(reallocate_gpu_memory_in_mb); #endif +#ifdef PADDLE_WITH_ASCEND_CL +DECLARE_uint64(reallocate_gpu_memory_in_mb); +#endif namespace paddle { namespace memory { @@ -235,6 +238,21 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool( } } #endif +#ifdef PADDLE_WITH_ASCEND_CL + if (system_allocator_->UseGpu()) { + if ((total_used_ + total_free_) == 0) { + // Compute the allocation size for gpu for the first allocation. + allocate_bytes = std::max(platform::NPUInitAllocSize(), request_bytes); + } else { + // Compute the re-allocation size, we store the re-allocation size when + // user set FLAGS_reallocate_gpu_memory_in_mb to fix value. + if (realloc_size_ == 0 || FLAGS_reallocate_gpu_memory_in_mb == 0ul) { + realloc_size_ = platform::NPUReallocSize(); + } + allocate_bytes = std::max(realloc_size_, request_bytes); + } + } +#endif // Allocate a new block void* p = system_allocator_->Alloc(&index, allocate_bytes); diff --git a/paddle/fluid/memory/detail/buddy_allocator.h b/paddle/fluid/memory/detail/buddy_allocator.h index 15e93deffccda8852b371a60ab3e08f9f8b811c2..135c3b6d04f346d361530ad5586e8f11e023d05c 100644 --- a/paddle/fluid/memory/detail/buddy_allocator.h +++ b/paddle/fluid/memory/detail/buddy_allocator.h @@ -26,6 +26,7 @@ limitations under the License. 
*/ #include "paddle/fluid/memory/detail/system_allocator.h" #include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/npu_info.h" namespace paddle { namespace memory { diff --git a/paddle/fluid/memory/detail/buddy_allocator_test.cc b/paddle/fluid/memory/detail/buddy_allocator_test.cc index 2dc3e73af24162ebdc7872403fe28d83044920dc..290f3d5d1bcd47b40b8ee35ad45cd103bd11b26e 100644 --- a/paddle/fluid/memory/detail/buddy_allocator_test.cc +++ b/paddle/fluid/memory/detail/buddy_allocator_test.cc @@ -19,14 +19,16 @@ limitations under the License. */ #ifdef WITH_GPERFTOOLS #include "gperftools/profiler.h" #endif +#include +#include + #include "gflags/gflags.h" #include "gtest/gtest.h" #include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/npu_info.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#include -#include - +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_ASCEND_CL) DECLARE_double(fraction_of_gpu_memory_to_use); DECLARE_uint64(initial_gpu_memory_in_mb); DECLARE_uint64(reallocate_gpu_memory_in_mb); @@ -342,6 +344,32 @@ TEST(BuddyAllocator, Release) { } #endif +#ifdef PADDLE_WITH_ASCEND_CL +TEST(BuddyAllocator, NpuFraction) { + // In a 16 GB machine, the pool size will be about 160 MB + FLAGS_fraction_of_gpu_memory_to_use = 0.005; + FLAGS_fraction_of_gpu_memory_to_use = 0.92; + FLAGS_initial_gpu_memory_in_mb = 0; + FLAGS_reallocate_gpu_memory_in_mb = 0; + + BuddyAllocator buddy_allocator( + std::unique_ptr(new NPUAllocator(0)), + platform::NPUMinChunkSize(), platform::NPUMaxChunkSize()); + + // Less than pool size + TestBuddyAllocator(&buddy_allocator, 10); + TestBuddyAllocator(&buddy_allocator, 10 << 10); + TestBuddyAllocator(&buddy_allocator, 10 << 20); + buddy_allocator.Release(); + + // Greater than max chunk size + TestBuddyAllocator(&buddy_allocator, 300 << 20, + /* use_system_allocator = */ true); + TestBuddyAllocator(&buddy_allocator, 1 * static_cast(1 << 30), + /* use_system_allocator = */ true); +} +#endif + } // namespace detail } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc index 38baf6c24bab3fb7ca55a15b4f231bf9eba7d82e..0d7065d8bfba0e4ba6f443a3f9e87ee0e1a825a6 100644 --- a/paddle/fluid/memory/detail/system_allocator.cc +++ b/paddle/fluid/memory/detail/system_allocator.cc @@ -29,6 +29,8 @@ limitations under the License. */ #include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/npu_info.h" + #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/cuda_device_guard.h" #endif @@ -123,6 +125,7 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) { size_t avail, total, actual_avail, actual_total; bool is_limited = platform::RecordedCudaMemGetInfo( &avail, &total, &actual_avail, &actual_total, gpu_id_); + size_t allocated = total - avail; std::string err_msg; if (is_limited) { @@ -137,7 +140,7 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) { PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted( "\n\nOut of memory error on GPU %d. " - "Cannot allocate %s memory on GPU %d, " + "Cannot allocate %s memory on GPU %d, %s memory has been allocated and " "available memory is only %s.\n\n" "Please check whether there is any other process using GPU %d.\n" "1. 
If yes, please stop them, or start PaddlePaddle on another GPU.\n" @@ -148,8 +151,8 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) { " The command is " "`export FLAGS_fraction_of_gpu_memory_to_use=xxx`.%s\n\n", gpu_id_, string::HumanReadableSize(size), gpu_id_, - string::HumanReadableSize(avail), gpu_id_, - FLAGS_fraction_of_gpu_memory_to_use, err_msg)); + string::HumanReadableSize(allocated), string::HumanReadableSize(avail), + gpu_id_, FLAGS_fraction_of_gpu_memory_to_use, err_msg)); } } @@ -247,6 +250,68 @@ bool CUDAPinnedAllocator::UseGpu() const { return false; } #endif +#ifdef PADDLE_WITH_ASCEND_CL +void* NPUAllocator::Alloc(size_t* index, size_t size) { + if (size <= 0) return nullptr; + + void* p; + auto result = platform::RecordedNPUMalloc(&p, size, npu_id_); + + if (result == ACL_ERROR_NONE) { + *index = 0; + npu_alloc_size_ += size; + return p; + } else { + size_t avail, total, actual_avail, actual_total; + bool is_limited = platform::RecordedNPUMemGetInfo( + &avail, &total, &actual_avail, &actual_total, npu_id_); + + std::string err_msg; + if (is_limited) { + auto limit_size = (total >> 20); + err_msg = string::Sprintf( + "\n 3) Set environment variable `FLAGS_gpu_memory_limit_mb` to a " + "larger value. Currently `FLAGS_gpu_memory_limit_mb` is %d, so the " + "maximum GPU memory usage is limited to %d MB.\n" + " The command is `export FLAGS_gpu_memory_limit_mb=xxx`.", + limit_size, limit_size); + } + + PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted( + "\n\nOut of memory error on NPU %d. " + "Cannot allocate %s memory on NPU %d, " + "available memory is only %s.\n\n" + "Please check whether there is any other process using NPU %d.\n" + "1. If yes, please stop them, or start PaddlePaddle on another NPU.\n" + "2. If no, please try one of the following suggestions:\n" + " 1) Decrease the batch size of your model.\n" + " 2) FLAGS_fraction_of_gpu_memory_to_use is %.2lf now, " + "please set it to a higher value but less than 1.0.\n" + " The command is " + "`export FLAGS_fraction_of_gpu_memory_to_use=xxx`.%s\n\n", + npu_id_, string::HumanReadableSize(size), npu_id_, + string::HumanReadableSize(avail), npu_id_, + FLAGS_fraction_of_gpu_memory_to_use, err_msg)); + } +} + +void NPUAllocator::Free(void* p, size_t size, size_t index) { + VLOG(4) << "Free " << p << " size " << size; + PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument( + "The index should be 0, index is %d", index)); + PADDLE_ENFORCE_GE(npu_alloc_size_, size, + platform::errors::InvalidArgument( + "The size of memory (%d) to free exceeds the size of " + "allocated gpu memory (%d)", + size, npu_alloc_size_)); + npu_alloc_size_ -= size; + + platform::RecordedNPUFree(p, size, npu_id_); +} + +bool NPUAllocator::UseGpu() const { return true; } +#endif + } // namespace detail } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/detail/system_allocator.h b/paddle/fluid/memory/detail/system_allocator.h index e332bb670da2357f5ed831e743c20579677b90a5..26711ae4070f5ed72f77519b196c4c354cb049e1 100644 --- a/paddle/fluid/memory/detail/system_allocator.h +++ b/paddle/fluid/memory/detail/system_allocator.h @@ -66,6 +66,22 @@ class CUDAPinnedAllocator : public SystemAllocator { }; #endif +#ifdef PADDLE_WITH_ASCEND_CL + +class NPUAllocator : public SystemAllocator { + public: + explicit NPUAllocator(int npu_id) : npu_id_(npu_id) {} + + virtual void* Alloc(size_t* index, size_t size); + virtual void Free(void* p, size_t size, size_t index); + virtual bool UseGpu() const; + + private: + 
+  size_t npu_alloc_size_ = 0;
+  int npu_id_;
+};
+#endif
+
 }  // namespace detail
 }  // namespace memory
 }  // namespace paddle
diff --git a/paddle/fluid/memory/detail/system_allocator_test.cc b/paddle/fluid/memory/detail/system_allocator_test.cc
index 13854d771a0bf60bfef90515795ee70d9cb7fb73..ead188341dac46bd3eec490015ff934dc8a26af5 100644
--- a/paddle/fluid/memory/detail/system_allocator_test.cc
+++ b/paddle/fluid/memory/detail/system_allocator_test.cc
@@ -85,3 +85,11 @@ TEST(GPUAllocator, AllocFailure) {
   }
 }
 #endif
+
+#ifdef PADDLE_WITH_ASCEND_CL
+TEST(NPUAllocator, Alloc) {
+  paddle::memory::detail::NPUAllocator a(0);
+  TestAllocator(&a, 1 << 20);
+  TestAllocator(&a, 1);
+}
+#endif
diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc
index 7f871fab5a1470b32dfd44376b7cf30e1b94656a..730d49e8acd93022e6e46f7285b9548ed7a5c6d8 100644
--- a/paddle/fluid/memory/memcpy.cc
+++ b/paddle/fluid/memory/memcpy.cc
@@ -40,7 +40,7 @@ void Copy(platform::XPUPlace dst_place,
                                                   platform::CPUPlace src_place,
                                                   const void* src, size_t num) {
   if (num <= 0) {
-    VLOG(0) << "memcpy XPU_HOST_TO_DEVICE size <= 0 (" << num << ")";
+    VLOG(1) << "memcpy XPU_HOST_TO_DEVICE size <= 0 (" << num << ")";
     return;
   }
   int dev_id = -1;
@@ -86,7 +86,7 @@ void Copy(platform::CPUPlace dst_place,
                                                   platform::XPUPlace src_place,
                                                   const void* src, size_t num) {
   if (num <= 0) {
-    VLOG(0) << "memcpy XPU_DEVICE_TO_HOST size <= 0 (" << num << ")";
+    VLOG(1) << "memcpy XPU_DEVICE_TO_HOST size <= 0 (" << num << ")";
     return;
   }
   int dev_id = -1;
@@ -132,7 +132,7 @@ void Copy(platform::XPUPlace dst_place,
                                                   platform::XPUPlace src_place,
                                                   const void* src, size_t num) {
   if (num <= 0) {
-    VLOG(0) << "memcpy XPU_DEVICE_TO_DEVICE size <= 0 (" << num << ")";
+    VLOG(1) << "memcpy XPU_DEVICE_TO_DEVICE size <= 0 (" << num << ")";
     return;
   }
   int dev_id = -1;
@@ -196,6 +196,106 @@ void Copy(platform::XPUPlace dst_place,
 }
 #endif
+#ifdef PADDLE_WITH_ASCEND_CL
+template <>
+void Copy<platform::NPUPlace, platform::CPUPlace>(platform::NPUPlace dst_place,
+                                                  void* dst,
+                                                  platform::CPUPlace src_place,
+                                                  const void* src, size_t num,
+                                                  aclrtStream stream) {
+  if (UNLIKELY(num == 0)) return;
+
+  platform::SetNPUDeviceId(dst_place.device);
+
+  VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
+          << dst_place << " by stream(" << stream << ")";
+
+  if (stream) {
+    platform::RecordEvent record_event("NpuMemcpyAsync:CPU->NPU");
+    platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE, stream);
+  } else {
+    // On NPU, an async operation after a sync operation is OK, while a sync
+    // operation after an async one is not, since the async operation may not
+    // have finished yet. So a wait is needed before the sync operation.
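+    // (Wait() on the corresponding NPUDeviceContext blocks until the work
+    //  already queued on its stream has completed, so the blocking
+    //  NPUMemcpySync below cannot overlap with an earlier asynchronous copy.)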
+    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+    static_cast<platform::NPUDeviceContext*>(pool.Get(dst_place))->Wait();
+
+    platform::RecordEvent record_event("NpuMemcpySync:CPU->NPU");
+    platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE);
+  }
+}
+
+template <>
+void Copy<platform::CPUPlace, platform::NPUPlace>(platform::CPUPlace dst_place,
+                                                  void* dst,
+                                                  platform::NPUPlace src_place,
+                                                  const void* src, size_t num,
+                                                  aclrtStream stream) {
+  if (UNLIKELY(num == 0)) return;
+
+  platform::SetNPUDeviceId(src_place.device);
+
+  VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
+          << dst_place << " by stream(" << stream << ")";
+
+  if (stream) {
+    platform::RecordEvent record_event("NpuMemcpyAsync:NPU->CPU");
+    platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST, stream);
+  } else {
+    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+    static_cast<platform::NPUDeviceContext*>(pool.Get(src_place))->Wait();
+
+    platform::RecordEvent record_event("NpuMemcpySync:NPU->CPU");
+    platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST);
+  }
+}
+
+template <>
+void Copy<platform::NPUPlace, platform::NPUPlace>(platform::NPUPlace dst_place,
+                                                  void* dst,
+                                                  platform::NPUPlace src_place,
+                                                  const void* src, size_t num,
+                                                  aclrtStream stream) {
+  if (UNLIKELY(num == 0)) return;
+
+  VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
+          << dst_place << " by stream(" << stream << ")";
+  if (dst_place == src_place) {
+    platform::SetNPUDeviceId(src_place.device);
+    if (stream) {
+      platform::RecordEvent record_event("NpuMemcpyAsync(same_npu):NPU->NPU");
+      platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE,
+                               stream);
+    } else {
+      platform::DeviceContextPool& pool =
+          platform::DeviceContextPool::Instance();
+      static_cast<platform::NPUDeviceContext*>(pool.Get(dst_place))->Wait();
+
+      platform::RecordEvent record_event("NpuMemcpySync(same_npu):NPU->NPU");
+      platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE);
+    }
+  } else {
+    if (!platform::NPUCanAccessPeer(dst_place.device, src_place.device)) {
+      PADDLE_THROW(platform::errors::Unavailable(
+          "Peer access between NPU places is not allowed."));
+    }
+    if (stream) {
+      // TODO(zhiqiu): support peer access?
+      platform::RecordEvent record_event("NpuMemcpyPeerAsync:NPU->NPU");
+      platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE,
+                               stream);
+    } else {
+      platform::DeviceContextPool& pool =
+          platform::DeviceContextPool::Instance();
+      static_cast<platform::NPUDeviceContext*>(pool.Get(dst_place))->Wait();
+
+      platform::RecordEvent record_event("NpuMemcpyPeerSync:NPU->NPU");
+      platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE);
+    }
+  }
+}
+#endif
+
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 static constexpr size_t kMaxGpuAsyncCopyBytes = 64 * 1024;  // 64K
diff --git a/paddle/fluid/memory/memcpy.h b/paddle/fluid/memory/memcpy.h
index 25490f28b659876ddd1e9e0eef7f23062791a05e..c630437224cd093438df5d8d8a58a5c8f6ab2ad2 100644
--- a/paddle/fluid/memory/memcpy.h
+++ b/paddle/fluid/memory/memcpy.h
@@ -52,7 +52,27 @@ void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num);
 template <typename DstPlace, typename SrcPlace>
 void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num,
           gpuStream_t stream);
+#endif
+#ifdef PADDLE_WITH_ASCEND_CL
+/**
+ * \brief Copy memory from one place to another place.
+ *
+ * \param[in] DstPlace Destination allocation place (CPU or NPU).
+ * \param[in] dst Destination memory address.
+ * \param[in] SrcPlace Source allocation place (CPU or NPU).
+ * \param[in] src Source memory address.
+ * \param[in] num memory size in bytes to copy.
+ * \param[in] stream NPU stream. + * + * \note For NPU memory copy, NPU stream need to be specified + * for asynchronously memory copy. + * + */ +template +void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num, + aclrtStream stream); #endif + } // namespace memory } // namespace paddle diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 467a5ff9063a65bd7905edd0b9818aa600d595bf..6e11c64afc4bd813362640e151203d4dd700fea5 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -10,6 +10,7 @@ file(WRITE ${pybind_file} "// Generated by the paddle/fluid/operators/CMakeLists copy_if_different(${pybind_file} ${pybind_file_final}) add_subdirectory(math) +add_subdirectory(eigen) add_subdirectory(controlflow) add_subdirectory(detection) add_subdirectory(elementwise) @@ -41,6 +42,10 @@ if (WITH_GPU AND TENSORRT_FOUND) add_subdirectory(tensorrt) endif() +if (WITH_DLNNE) + add_subdirectory(dlnne) +endif() + if (WITH_LITE) add_subdirectory(lite) endif() @@ -68,7 +73,7 @@ if(WITH_UNITY_BUILD) include(unity_build_rule.cmake) endif() -register_operators(EXCLUDES py_func_op warpctc_op dgc_op lstm_op run_program_op eye_op recurrent_op +register_operators(EXCLUDES py_layer_op py_func_op warpctc_op dgc_op lstm_op run_program_op eye_op recurrent_op sync_batch_norm_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS}) op_library(run_program_op SRCS run_program_op.cc run_program_op.cu.cc DEPS executor_cache ${OP_HEADER_DEPS}) @@ -110,8 +115,9 @@ set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_fun set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions beam_search fc matrix_inverse) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} box_wrapper boost ps_gpu_wrapper) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} common_infer_shape_functions) +set(COMMON_OP_DEPS ${COMMON_OP_DEPS} eigen_cc_function) if (WITH_GPU OR WITH_ROCM) - set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv prelu bert_encoder_functor) + set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv prelu bert_encoder_functor eigen_cu_function) endif() set(COMMON_OP_DEPS ${COMMON_OP_DEPS} device_memory_aligment) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} layer) @@ -121,6 +127,12 @@ if (WITH_ASCEND) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} ascend_wrapper) endif() +if (WITH_ASCEND_CL) + cc_test(assign_op_npu_test SRCS assign_op_npu_test.cc DEPS assign_op) + cc_library(npu_op_runner SRCS npu_op_runner.cc DEPS operator npu_info) + set(COMMON_OP_DEPS ${COMMON_OP_DEPS} npu_op_runner) +endif() + # FIXME(typhoonzero): operator deps may not needed. 
# op_library(lod_tensor_to_array_op DEPS lod_rank_table_op) # op_library(array_to_lod_tensor_op DEPS lod_rank_table_op) @@ -134,8 +146,8 @@ set(OPERATOR_DEPS ${OPERATOR_DEPS} ${COMMON_OP_DEPS}) set(GLOB_OPERATOR_DEPS ${OPERATOR_DEPS} CACHE INTERNAL "Global Op dependencies") cc_test(test_common_infer_shape_functions SRCS test_common_infer_shape_functions.cc DEPS common_infer_shape_functions ${COMMON_OP_DEPS} activation_op elementwise_add_op softmax_op softmax) -cc_test(assign_op_test SRCS assign_op_test.cc DEPS assign_op) cc_test(gather_test SRCS gather_test.cc DEPS tensor) +cc_test(assign_op_test SRCS assign_op_test.cc DEPS assign_op) cc_test(scatter_test SRCS scatter_test.cc DEPS tensor math_function) cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor) cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor memory) @@ -154,12 +166,22 @@ endif() cc_library(tensor_formatter SRCS tensor_formatter.cc DEPS ${OP_HEADER_DEPS}) if (WITH_PYTHON) cc_library(py_func_op SRCS py_func_op.cc DEPS op_registry python pybind) + cc_library(py_layer_op SRCS py_layer_op.cc DEPS op_registry python pybind) +endif() + +if (WITH_ASCEND_CL) + cc_test(range_op_npu_test SRCS range_op_npu_test.cc DEPS op_registry range_op scope device_context enforce executor) + cc_test(expand_op_npu_test SRCS expand_op_npu_test.cc DEPS op_registry expand_op eigen_cc_function scope device_context enforce executor compare_op) endif() set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library") add_subdirectory(benchmark) cc_test(op_debug_string_test SRCS op_debug_string_test.cc DEPS elementwise_add_op) +if (WITH_ASCEND_CL) + cc_test(transpose_op_npu_test SRCS transpose_op_npu_test.cc DEPS op_registry transpose_op scope device_context enforce executor) +endif() + if(WITH_MKLDNN) include(mkldnn/inplace_op_tests.cmake) @@ -173,3 +195,11 @@ if(WITH_UNITY_BUILD) # The specified link dependency needs to be displayed here. target_link_libraries(paddle_operators_unity ${OP_HEADER_DEPS} ${COMMON_OP_DEPS}) endif() + +if(WITH_ASCEND_CL) +cc_test(gelu_op_npu_test SRCS gelu_op_npu_test.cc DEPS op_registry gelu_op scope device_context enforce executor) +endif() + +if (WITH_GPU OR WITH_ASCEND_CL) +cc_test(copy_cross_scope_test SRCS copy_cross_scope_test.cc DEPS op_registry copy_cross_scope_op scope device_context enforce executor) +endif() diff --git a/paddle/fluid/operators/abs_op.cu b/paddle/fluid/operators/abs_op.cu index e373d628f6cbd6b5ee48edc984a68d2767ce0593..97409e6cb1b17b8fc109e30dc78720b8d573f042 100644 --- a/paddle/fluid/operators/abs_op.cu +++ b/paddle/fluid/operators/abs_op.cu @@ -13,44 +13,79 @@ // limitations under the License. 
#include "paddle/fluid/operators/abs_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" #include "paddle/fluid/platform/complex128.h" #include "paddle/fluid/platform/complex64.h" #include "paddle/fluid/platform/float16.h" +namespace paddle { +namespace operators { + +template +struct CudaAbsFunctor; + +template +struct CudaAbsFunctor>> { + __device__ __forceinline__ math::Real operator()(const T* args) const { + return abs(args[0]); + } +}; + +template +struct CudaAbsFunctor>> { + __device__ __forceinline__ T operator()(const T* args) const { + return std::abs(args[0]); + } +}; + +template +class AbsKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* x = context.Input("X"); + Tensor* out = context.Output("Out"); + out->mutable_data>(context.GetPlace()); + + auto& dev_ctx = + context.template device_context(); + std::vector ins = {x}; + std::vector outs = {out}; + auto functor = CudaAbsFunctor(); + LaunchElementwiseCudaKernel>( + dev_ctx, ins, &outs, functor); + } +}; + +} // namespace operators +} // namespace paddle + namespace ops = paddle::operators; +namespace plat = paddle::platform; + REGISTER_OP_CUDA_KERNEL( - abs, ops::AbsKernel, - ops::AbsKernel, - ops::AbsKernel, - ops::AbsKernel, - ops::AbsKernel, - ops::AbsKernel, - ops::AbsKernel); + abs, ops::AbsKernel, + ops::AbsKernel, + ops::AbsKernel, + ops::AbsKernel, + ops::AbsKernel, + ops::AbsKernel, + ops::AbsKernel); REGISTER_OP_CUDA_KERNEL( - abs_grad, ops::AbsGradKernel, - ops::AbsGradKernel, - ops::AbsGradKernel, - ops::AbsGradKernel, - ops::AbsGradKernel, - ops::AbsGradKernel, - ops::AbsGradKernel); + abs_grad, ops::AbsGradKernel, + ops::AbsGradKernel, + ops::AbsGradKernel, + ops::AbsGradKernel, + ops::AbsGradKernel, + ops::AbsGradKernel, + ops::AbsGradKernel); REGISTER_OP_CUDA_KERNEL( - abs_grad_grad, - ops::AbsDoubleGradKernel, - ops::AbsDoubleGradKernel, - ops::AbsDoubleGradKernel, - ops::AbsDoubleGradKernel, - ops::AbsDoubleGradKernel, - ops::AbsDoubleGradKernel, - ops::AbsDoubleGradKernel); + abs_grad_grad, ops::AbsDoubleGradKernel, + ops::AbsDoubleGradKernel, + ops::AbsDoubleGradKernel, + ops::AbsDoubleGradKernel, + ops::AbsDoubleGradKernel, + ops::AbsDoubleGradKernel, + ops::AbsDoubleGradKernel); diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index 94f2eb3672bd5d06f4a3f310cdad39119c336a0f..055909ba6f486ff82220c2d36c54687091bde9ed 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -162,6 +162,12 @@ $$out = \\frac{1}{1 + e^{-x}}$$ )DOC"; +UNUSED constexpr char SiluDoc[] = R"DOC( +Silu Activation Operator + +$$out = x * \\frac{1}{1 + e^{-x}}$$ +)DOC"; + UNUSED constexpr char LogSigmoidDoc[] = R"DOC( Logsigmoid Activation Operator @@ -697,6 +703,7 @@ It is recommended to use the defaults for this activation. 
}; REGISTER_ACTIVATION_OP_MAKER(Sigmoid, SigmoidDoc); +REGISTER_ACTIVATION_OP_MAKER(Silu, SiluDoc); REGISTER_ACTIVATION_OP_MAKER(LogSigmoid, LogSigmoidDoc); REGISTER_ACTIVATION_OP_MAKER(Exp, ExpDoc); REGISTER_ACTIVATION_OP_MAKER(Relu, ReluDoc); @@ -782,6 +789,26 @@ class ActivationOpDoubleGrad2 : public framework::OperatorWithKernel { } }; +template +class TanhDoubleGradMaker : public ::paddle::framework::SingleGradOpMaker { + public: + using ::paddle::framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("tanh_grad_grad"); + // input1: Out + op->SetInput("Out", this->Input("Out")); + // input2: ddx + op->SetInput("DDX", this->OutputGrad(framework::GradVarName("X"))); + op->SetInput("DOut", this->Input(framework::GradVarName("Out"))); + op->SetAttrMap(this->Attrs()); + // output: ddy + op->SetOutput("DOutNew", this->InputGrad("Out")); + op->SetOutput("DDOut", this->InputGrad(framework::GradVarName("Out"))); + } +}; + // ReluGrad: dx = dy if y >= 0 else 0 // ReluGradGrad: ddy = ddx if y >= 0 else 0 template @@ -1041,6 +1068,34 @@ namespace plat = paddle::platform; FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_OP); FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_CPU_KERNEL); +/* ========================== tanh register ============================= */ +REGISTER_OPERATOR( + tanh, ops::ActivationOp, ops::TanhOpMaker, ops::ActivationOpInferVarType, + ops::ActivationGradOpMaker::FwdDeps(), + paddle::framework::OpDesc>, + ops::ActivationGradOpMaker::FwdDeps(), + paddle::imperative::OpBase>, + std::conditional>(), + ops::ActFwdInplaceInferer, void>::type); +REGISTER_OPERATOR(tanh_grad, ops::ActivationOpGrad, + ops::ActivationGradOpInplaceInferer, + ops::TanhDoubleGradMaker, + ops::TanhDoubleGradMaker) +REGISTER_OPERATOR( + tanh_grad_grad, + ops::ActivationOpDoubleGrad::FwdDeps()>, + ops::ActivationDoubleGradOpInplaceInferer); + +REGISTER_ACTIVATION_CPU_KERNEL(tanh, Tanh, TanhFunctor, TanhGradFunctor); +REGISTER_OP_CPU_KERNEL( + tanh_grad_grad, ops::TanhDoubleGradKernel>, + ops::TanhDoubleGradKernel>, + ops::TanhDoubleGradKernel>); +/* ========================================================================== */ + /* ========================== relu register ============================= */ REGISTER_OPERATOR( relu, ops::ActivationOp, ops::ReluOpMaker, ops::ActivationOpInferVarType, diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu index 2033081af224a4e938a7b4f0f619729feea57506..618f17031b1ef3b4b96ea72b05f9f63edd01c794 100644 --- a/paddle/fluid/operators/activation_op.cu +++ b/paddle/fluid/operators/activation_op.cu @@ -10,32 +10,1409 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/activation_op.h" -#include "paddle/fluid/platform/float16.h" +#include "paddle/fluid/operators/amp/fp16_type_traits.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" +#include "paddle/fluid/operators/math/math_cuda_utils.h" +#include "paddle/fluid/platform/cuda_device_function.h" + +namespace paddle { +namespace operators { + +template +struct CudaReluFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + + // relu(x) = max(x, 0) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + return args[0] > zero ? 
args[0] : zero; + } +}; + +template +struct CudaReluGradFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + + // dx = dout * (out > 0) + // Inputs: args[0], the input dout + // args[1], the input out + __device__ __forceinline__ T operator()(const T* args) const { + return args[1] > zero ? args[0] : zero; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } +}; + +template +struct CudaLeakyReluFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + float alpha; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + + // leakyrelu(x) = x > 0 ? x : alpha * x + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + return args[0] > zero ? args[0] : static_cast(alpha) * args[0]; + } +}; + +template +struct CudaLeakyReluGradFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + float alpha; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + + // dx = dout * (x > 0 ? 1 : alpha) + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + return args[1] > zero ? args[0] : static_cast(alpha) * args[0]; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaSigmoidFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + + // sigmoid(x) = 1 / (1 + exp(-x)) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(one / (one + exp(-x))); + } +}; + +template +struct CudaSigmoidGradFunctor : public BaseActivationFunctor { + T one = static_cast(1.0f); + + // dx = dout * out * (1 - out) + // Inputs: args[0], the input dout + // args[1], the input out + __device__ __forceinline__ T operator()(const T* args) const { + return args[0] * args[1] * (one - args[1]); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } +}; + +template +struct CudaSiluFunctor : public BaseActivationFunctor { + // MPType means Compute Type + using MPType = typename details::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + + // silu(x) = x / (1 + exp(-x)) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(x / (one + exp(-x))); + } +}; + +template +struct CudaSiluGradFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + + // dx = dout * (1 + exp(-x) + x * exp(-x) / (1 + exp(-x))^2) + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType dout = static_cast(args[0]); + MPType x = static_cast(args[1]); + MPType temp = one / (one + exp(-x)); + return static_cast(dout * (temp * (one + x * (one - temp)))); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaLogSigmoidFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + MPType zero = static_cast(0.0f); + + // logsigmoid(x) = log(1 / (1 + exp(-x))) + // For numerical stability, + // logsigmoid(x) = + // - (max(-x, 0) + log(exp(-max(-x, 0)) + exp(-x - max(-x, 0)))) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* 
args) const { + MPType x = static_cast(args[0]); + MPType temp = x > zero ? zero : -x; + return static_cast(-temp - log(exp(-temp) + exp(-x - temp))); + } +}; + +template +struct CudaLogSigmoidGradFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + MPType zero = static_cast(0.0f); + + // dx = dout * exp(-x) / (1 + exp(-x)) + // For numerical stability: + // dx = dout * exp(-x - max(-x, 0)) / (exp(-max(-x, 0)) + exp(-x - max(-x, + // 0))) + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType dout = static_cast(args[0]); + MPType x = static_cast(args[1]); + MPType temp1 = x > zero ? zero : -x; + MPType temp2 = exp(-x - temp1); + return static_cast(dout * (temp2 / (exp(-temp1) + temp2))); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaAtanFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // atan(x) = atan(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(atan(x)); + } +}; + +template +struct CudaAtanGradFunctor : public BaseActivationFunctor { + T one = static_cast(1.0f); + + // dx = dout / (1 + x^2) + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + return args[0] / (one + args[1] * args[1]); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaSoftShrinkFunctor : public BaseActivationFunctor { + float lambda; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"lambda", &lambda}}; + } + + // softshrink(x) = x - lambda, if x > lambda; + // x + lambda, if x < -lambda; + // 0, otherwise. + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + T x = args[0]; + T l = static_cast(lambda); + T temp1 = static_cast(x > l); + T temp2 = static_cast(x < -l); + return temp1 * (x - l) + temp2 * (x + l); + } +}; + +template +struct CudaSoftShrinkGradFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + float lambda; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"lambda", &lambda}}; + } + + // dx = dout, if x > lambda or x < -lambda else 0 + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + T x = args[1]; + T l = static_cast(lambda); + return (x >= -l && x <= l) ? 
zero : args[0]; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaCeilFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // ceil(x) = ceil(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(ceil(x)); + } +}; + +template +struct CudaFloorFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // floor(x) = floor(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(floor(x)); + } +}; + +template +struct CudaRoundFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // round(x) = round(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(round(x)); + } +}; + +// grad functor for ceil, floor and round +template +struct CudaZeroGradFunctor : public BaseActivationFunctor { + __device__ __forceinline__ T operator()(const T* args) const { + return static_cast(0.0f); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kNoDeps; } +}; + +template +struct CudaCosFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // cos(x) = cos(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(cos(x)); + } +}; + +template +struct CudaCosGradFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // dx = dout * (-sin(x)) + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType dout = static_cast(args[0]); + MPType x = static_cast(args[1]); + return static_cast(-dout * sin(x)); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaSinFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // sin(x) = sin(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(sin(x)); + } +}; + +template +struct CudaSinGradFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // dx = dout * cos(x) + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType dout = static_cast(args[0]); + MPType x = static_cast(args[1]); + return static_cast(dout * cos(x)); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaTanFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // tan(x) = tan(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(tan(x)); + } +}; + +template +struct CudaTanGradFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // dx = dout / cos(x)^2 + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType dout = static_cast(args[0]); + MPType 
x = static_cast(args[1]); + return static_cast(dout / (cos(x) * cos(x))); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaAsinFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // asin(x) = asin(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(asin(x)); + } +}; + +template +struct CudaAsinGradFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + + // dx = dout / sqrt(1 - x^2) + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType dout = static_cast(args[0]); + MPType x = static_cast(args[1]); + return static_cast(dout / sqrt(one - x * x)); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaAcosFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // acos(x) = acos(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(acos(x)); + } +}; + +template +struct CudaAcosGradFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + + // dx = -dout / sqrt(1 - x^2) + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType dout = static_cast(args[0]); + MPType x = static_cast(args[1]); + return static_cast(-dout / sqrt(one - x * x)); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaCoshFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // cosh(x) = cosh(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(cosh(x)); + } +}; + +template +struct CudaCoshGradFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // dx = dout * sinh(x) + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType dout = static_cast(args[0]); + MPType x = static_cast(args[1]); + return static_cast(dout * sinh(x)); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaSinhFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // sinh(x) = sinh(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(sinh(x)); + } +}; + +template +struct CudaSinhGradFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // dx = dout * cosh(x) + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType dout = static_cast(args[0]); + MPType x = static_cast(args[1]); + return static_cast(dout * cosh(x)); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaTanhFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // tanh(x) = tanh(x) + // 
Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(tanh(x)); + } +}; + +template +struct CudaTanhGradFunctor : public BaseActivationFunctor { + T one = static_cast(1.0f); + + // dx = dout * (1 - out^2) + // Inputs: args[0], the input dout + // args[1], the input out + __device__ __forceinline__ T operator()(const T* args) const { + T dout = static_cast(args[0]); + T out = static_cast(args[1]); + return dout * (one - out * out); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } +}; + +template +struct CudaReciprocalFunctor : public BaseActivationFunctor { + T one = static_cast(1.0f); + + // reciprocal(x) = 1 / x + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + return one / args[0]; + } +}; + +template +struct CudaReciprocalGradFunctor : public BaseActivationFunctor { + // dx = -dout * out^2 + // Inputs: args[0], the input dout + // args[1], the input out + __device__ __forceinline__ T operator()(const T* args) const { + return -args[0] * args[1] * args[1]; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } +}; + +template +struct CudaExpFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // exp(x) = exp(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(exp(x)); + } +}; + +template +struct CudaExpGradFunctor : public BaseActivationFunctor { + // dx = dout * out + // Inputs: args[0], the input dout + // args[1], the input out + __device__ __forceinline__ T operator()(const T* args) const { + return args[0] * args[1]; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } +}; + +template +struct CudaLogFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // log(x) = log(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(log(x)); + } +}; + +template +struct CudaLogGradFunctor : public BaseActivationFunctor { + // dx = dout / x + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + return args[0] / args[1]; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaSquareFunctor : public BaseActivationFunctor { + // square(x) = x * x + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + return args[0] * args[0]; + } +}; + +template +struct CudaSquareGradFunctor : public BaseActivationFunctor { + T two = static_cast(2.0f); + + // dx = dout * 2 * x + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + return args[0] * two * args[1]; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaSqrtFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // sqrt(x) = sqrt(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(sqrt(x)); + } +}; + +template +struct CudaSqrtGradFunctor : public BaseActivationFunctor { + T one_half = static_cast(0.5f); + + // dx = dout * 0.5 / out + // 
Inputs: args[0], the input dout + // args[1], the input out + __device__ __forceinline__ T operator()(const T* args) const { + return one_half * args[0] / args[1]; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } +}; + +template +struct CudaRsqrtFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // rsqrt(x) = rsqrt(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(rsqrt(x)); + } +}; + +template +struct CudaRsqrtGradFunctor : public BaseActivationFunctor { + T minus_one_half = static_cast(-0.5f); + + // dx = dout * -0.5 / out^3 + // Inputs: args[0], the input dout + // args[1], the input out + __device__ __forceinline__ T operator()(const T* args) const { + T out = args[1]; + return minus_one_half * args[0] * out * out * out; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } +}; + +template +struct CudaLog1pFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + + // log1p(x) = log(1 + x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(log(one + x)); + } +}; + +template +struct CudaLog1pGradFunctor : public BaseActivationFunctor { + T one = static_cast(1.0f); + + // dx = dout / (1 + x) + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + return args[0] / (one + args[1]); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaLog2Functor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // log2(x) = log2(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(log2(x)); + } +}; + +template +struct CudaLog2GradFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + T log_two = static_cast(log(static_cast(2.0f))); + + // dx = dout / (x * log(2)) + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + return args[0] / (args[1] * log_two); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaLog10Functor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // log10(x) = log10(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(log10(x)); + } +}; + +template +struct CudaLog10GradFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + T log_ten = static_cast(log(static_cast(10.0f))); + + // dx = dout / (x * log(10)) + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + return args[0] / (args[1] * log_ten); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaBReluFunctor : public BaseActivationFunctor { + float t_min; + float t_max; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"t_min", &t_min}, {"t_max", &t_max}}; + } + + // brelu(x) = min(max(x, t_min), t_max) + // Inputs: args[0], the 
input x + __device__ __forceinline__ T operator()(const T* args) const { + T x = args[0]; + T t_min_cast = static_cast(t_min); + T t_max_cast = static_cast(t_max); + T temp_max = x > t_min_cast ? x : t_min_cast; + T temp_min = temp_max < t_max_cast ? temp_max : t_max_cast; + return temp_min; + } +}; + +template +struct CudaBReluGradFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + float t_min; + float t_max; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"t_min", &t_min}, {"t_max", &t_max}}; + } + + // dx = (x > t_min && x < t_max) ? dout : 0 + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + T dout = args[0]; + T x = args[1]; + T t_min_cast = static_cast(t_min); + T t_max_cast = static_cast(t_max); + return (x > t_min_cast && x < t_max_cast) ? dout : zero; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaSoftReluFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + // soft_relu(x) = log(1 + exp(max(min(x, threshold), -threshold))) + // Inputs: args[0], the input x + // threshold should not be negative + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + MPType t = static_cast(threshold); + MPType temp_min = x < t ? x : t; + MPType temp_max = temp_min > -t ? temp_min : -t; + return static_cast(log(one + exp(temp_max))); + } +}; + +template +struct CudaSoftReluGradFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + // dx = (out > -threshold && out < threshold) ? dout * (1 - exp(-out)) : 0 + // Inputs: args[0], the input dout + // args[1], the input out + // threshold should not be negative + __device__ __forceinline__ T operator()(const T* args) const { + MPType dout = static_cast(args[0]); + MPType out = static_cast(args[1]); + MPType t = static_cast(threshold); + return (out > -t && out < t) ? 
static_cast(dout * (one - exp(-out))) + : static_cast(0.0f); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } +}; + +template +struct CudaSTanhFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + float scale_a; + float scale_b; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"scale_a", &scale_a}, {"scale_b", &scale_b}}; + } + + // stanh(x) = b * tanh(a * x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + MPType a = static_cast(scale_a); + MPType b = static_cast(scale_b); + return static_cast(b * tanh(a * x)); + } +}; + +template +struct CudaSTanhGradFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + float scale_a; + float scale_b; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"scale_a", &scale_a}, {"scale_b", &scale_b}}; + } + + // dx = dout * a * b * (1 - tanh(a * x) * tanh(a * x)) + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType dout = static_cast(args[0]); + MPType x = static_cast(args[1]); + MPType a = static_cast(scale_a); + MPType b = static_cast(scale_b); + MPType temp = tanh(a * x); + return static_cast(dout * a * b * (one - temp * temp)); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaSoftplusFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + float beta; + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"beta", &beta}, {"threshold", &threshold}}; + } + + // softplus(x) = beta * x > threshold ? x : log(1 + exp(beta * x)) / beta + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + MPType b = static_cast(beta); + MPType t = static_cast(threshold); + MPType x_beta = x * beta; + return static_cast(x_beta > t ? x : log(one + exp(x_beta)) / b); + } +}; + +template +struct CudaSoftplusGradFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + float beta; + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"beta", &beta}, {"threshold", &threshold}}; + } + + // dx = x * beta > threshold ? dout : dout / (1 + exp(-beta * x)) + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType dout = static_cast(args[0]); + MPType x = static_cast(args[1]); + MPType b = static_cast(beta); + MPType t = static_cast(threshold); + MPType x_beta = x * beta; + return x_beta > t ? 
args[0] : static_cast(dout / (one + exp(-x_beta))); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaSoftsignFunctor : public BaseActivationFunctor { + T one = static_cast(1.0f); + + // softsign(x) = x / (1 + abs(x)) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + return args[0] / (one + abs(args[0])); + } +}; + +template +struct CudaSoftsignGradFunctor : public BaseActivationFunctor { + T one = static_cast(1.0f); + + // dx = dout / (1 + abs(x))^2 + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + T temp = one + abs(args[1]); + return args[0] / (temp * temp); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaRelu6Functor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + // relu6(x) = min(max(0, x), 6) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + T t = static_cast(threshold); + return args[0] <= zero ? zero : (args[0] < t ? args[0] : t); + } +}; + +template +struct CudaRelu6GradFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + // dx = (out > 0 && out < t) ? dout : 0 + // Inputs: args[0], the input dout + // args[1], the input out + __device__ __forceinline__ T operator()(const T* args) const { + T t = static_cast(threshold); + return (args[1] > zero && args[1] < t) ? args[0] : zero; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } +}; + +template +struct CudaTanhShrinkFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // tanhshrink(x) = x - tanh(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(x - tanh(x)); + } +}; + +template +struct CudaTanhShrinkGradFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // dx = dout * tanh(x)^2 + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType dout = static_cast(args[0]); + MPType x = static_cast(args[1]); + return static_cast(dout * tanh(x) * tanh(x)); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaHardShrinkFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + // hadrshrink(x) = (x > -threshold && x < threshold) ? 0 : x + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + T x = args[0]; + T t = static_cast(threshold); + return (x > -t && x < t) ? zero : x; + } +}; + +template +struct CudaHardShrinkGradFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + // dx = (x > -threshold && x < threshold) ? 
0 : dout + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + T x = args[1]; + T t = static_cast(threshold); + return (x > -t && x < t) ? zero : args[0]; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaHardSigmoidFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + T one = static_cast(1.0f); + float slope; + float offset; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"slope", &slope}, {"offset", &offset}}; + } + + // hard_sigmoid(x) = 0, when x <= -3 + // 1, when x >= 3 + // x * slope + offset, otherwise + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + T temp = args[0] * static_cast(slope) + static_cast(offset); + T temp_max = temp > zero ? temp : zero; + T temp_min = temp_max < one ? temp_max : one; + return temp_min; + } +}; + +template +struct CudaHardSigmoidGradFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + T one = static_cast(1.0f); + float slope; + float offset; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"slope", &slope}, {"offset", &offset}}; + } + + // dx = (out > 0 && out < 1) ? dout * slope : 0 + // Inputs: args[0], the input dout + // args[1], the input out + __device__ __forceinline__ T operator()(const T* args) const { + T out = args[1]; + return (out > zero && out < one) ? args[0] * static_cast(slope) : zero; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } +}; + +template +struct CudaSwishFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + float beta; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"beta", &beta}}; + } + + // swish(x) = x / (1 + exp(-beta * x)) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + MPType b = static_cast(beta); + return static_cast(x / (one + exp(-b * x))); + } +}; + +template +struct CudaSwishGradFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + float beta; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"beta", &beta}}; + } + + // dx = dout * (1 + exp(-b * x) + b * x * exp(-b * x) / (1 + exp(-b * x))^2) + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType dout = static_cast(args[0]); + MPType x = static_cast(args[1]); + MPType b = static_cast(beta); + MPType temp1 = one / (one + exp(-b * x)); + MPType out = x * temp1; + MPType temp2 = b * out; + MPType temp3 = temp1 * (one - temp2); + return static_cast(dout * (temp2 + temp3)); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaThresholdedReluFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + // thresholded_relu(x) = x > threshold ? x : 0 + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + return args[0] > static_cast(threshold) ? 
args[0] : zero; + } +}; + +template +struct CudaThresholdedReluGradFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + // dx = x > threshold ? dout : 0 + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + return args[1] > static_cast(threshold) ? args[0] : zero; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaHardSwishFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + float threshold; + float scale; + float offset; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}, {"scale", &scale}, {"offset", &offset}}; + } + + // hard_swish(x) = 0, when x <= -offset + // x , when x >= threshold - offset + // x * (x + offset) / scale, otherwise + // threshold = scale = 6, offset = 3 by default + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + T x = args[0]; + T t = static_cast(threshold); + T temp = x + static_cast(offset); + T temp_max = temp > zero ? temp : zero; + T temp_min = temp_max < t ? temp_max : t; + return temp_min * x / static_cast(scale); + } +}; + +template +struct CudaHardSwishGradFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + T one = static_cast(1.0f); + T two = static_cast(2.0f); + float threshold; + float scale; + float offset; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}, {"scale", &scale}, {"offset", &offset}}; + } + + // dx = 0, when x <= -offset + // dout , when x >= threshold - offset + // dout * (2 * x / scale + offset / scale), otherwise + // threshold = scale = 6, offset = 3 by default + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + T x = args[1]; + T o = static_cast(offset); + T s = static_cast(scale); + T temp1 = static_cast(x + o > zero); + T temp2 = static_cast(x + o < static_cast(threshold)); + return args[0] * (temp1 * temp2 * (two * x + o) / s + one - temp2); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaELUFunctor : public BaseActivationFunctor { + using CT = typename details::MPTypeTrait::Type; + CT zero = static_cast(0.0f); + CT one = static_cast(1.0f); + float alpha; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + + // elu(x) = max(0, x) + min(0, alpha * (exp(x) - 1)) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + CT x = static_cast(args[0]); + CT temp = static_cast(alpha) * (exp(x) - one); + CT res = (x > zero ? x : zero) + (temp > zero ? 
zero : temp); + return static_cast(res); + } +}; + +template +struct CudaELUGradFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + MPType zero = static_cast(0.0f); + MPType one = static_cast(1.0f); + float alpha; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + + // dx = dout, if alpha > 0 and x > 0 + // dx = dout * alpha * x.exp(), if alpha > 0 and x <= 0 + // dx = dout * (1 + alpha * x.exp()), if alpha <= 0 and x > 0 + // dx = 0, if alpha <= 0 and x <=0 + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType dout = static_cast(args[0]); + MPType x = static_cast(args[1]); + MPType a = static_cast(alpha); + MPType temp_a_pos = static_cast(alpha > 0.0f); + MPType temp_a_neg = static_cast(alpha <= 0.0f); + MPType temp_x_pos = static_cast(x > zero); + MPType temp_x_neg = static_cast(x <= zero); + return static_cast( + dout * (temp_a_pos * temp_x_pos + temp_a_pos * temp_x_neg * a * exp(x) + + temp_a_neg * temp_x_pos * (one + a * exp(x)))); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +class ActivationCudaKernel + : public framework::OpKernel { + public: + using T = typename Functor::ELEMENT_TYPE; + void Compute(const framework::ExecutionContext& ctx) const override { + const framework::Tensor* x = nullptr; + framework::Tensor* out = nullptr; + ExtractActivationTensor(ctx, &x, &out); + out->mutable_data(ctx.GetPlace()); + auto& dev_ctx = ctx.template device_context(); + std::vector ins = {x}; + std::vector outs = {out}; + auto functor = Functor(); + auto attrs = functor.GetAttrs(); + for (auto& attr : attrs) { + *attr.second = ctx.Attr(attr.first); + } + LaunchElementwiseCudaKernel(dev_ctx, ins, + &outs, functor); + } +}; + +template +class ActivationGradCudaKernel + : public framework::OpKernel { + public: + using T = typename Functor::ELEMENT_TYPE; + void Compute(const framework::ExecutionContext& ctx) const override { + const framework::Tensor *x, *out, *d_out; + framework::Tensor* d_x = nullptr; + x = out = d_out = nullptr; + ExtractActivationGradTensor(ctx, &x, &out, &d_out, + &d_x); + d_x->mutable_data(ctx.GetPlace()); + auto& dev_ctx = ctx.template device_context(); + auto functor = Functor(); + auto attrs = functor.GetAttrs(); + for (auto& attr : attrs) { + *attr.second = ctx.Attr(attr.first); + } + + std::vector ins = {d_out}; + std::vector outs = {d_x}; + + if (static_cast(Functor::FwdDeps()) == static_cast(kDepOut)) { + // Only need forward output Out + ins.push_back(out); + LaunchElementwiseCudaKernel( + dev_ctx, ins, &outs, functor); + } else if (static_cast(Functor::FwdDeps()) == + static_cast(kDepX)) { + // Only need forward input X + ins.push_back(x); + LaunchElementwiseCudaKernel( + dev_ctx, ins, &outs, functor); + } else { + LaunchElementwiseCudaKernel( + dev_ctx, ins, &outs, functor); + } + } +}; + +} // namespace operators +} // namespace paddle namespace ops = paddle::operators; namespace plat = paddle::platform; -#define REGISTER_ACTIVATION_CUDA_KERNEL(act_type, op_name, functor, \ - grad_functor) \ - REGISTER_OP_CUDA_KERNEL( \ - act_type, \ - ops::ActivationKernel>, \ - ops::ActivationKernel>, \ - ops::ActivationKernel>); \ - REGISTER_OP_CUDA_KERNEL( \ - act_type##_grad, ops::ActivationGradKernel>, \ - ops::ActivationGradKernel>, \ - ops::ActivationGradKernel>); - -FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_CUDA_KERNEL); +#define 
REGISTER_ACTIVATION_CUDA_KERNEL(act_type, op_name, functor, \ + grad_functor) \ + REGISTER_OP_CUDA_KERNEL( \ + act_type, ops::ActivationCudaKernel>, \ + ops::ActivationCudaKernel>, \ + ops::ActivationCudaKernel>); \ + REGISTER_OP_CUDA_KERNEL( \ + act_type##_grad, \ + ops::ActivationGradCudaKernel>, \ + ops::ActivationGradCudaKernel>, \ + ops::ActivationGradCudaKernel>); + +#define REGISTER_ACTIVATION_CUDA_KERNEL_INT(act_type, op_name, functor, \ + grad_functor) \ + REGISTER_OP_CUDA_KERNEL( \ + act_type, ops::ActivationCudaKernel>, \ + ops::ActivationCudaKernel>, \ + ops::ActivationCudaKernel>, \ + ops::ActivationCudaKernel>, \ + ops::ActivationCudaKernel>); \ + REGISTER_OP_CUDA_KERNEL( \ + act_type##_grad, \ + ops::ActivationGradCudaKernel>, \ + ops::ActivationGradCudaKernel>, \ + ops::ActivationGradCudaKernel>, \ + ops::ActivationGradCudaKernel>, \ + ops::ActivationGradCudaKernel>); /* ======================== leaky relu register ============================ */ -REGISTER_ACTIVATION_CUDA_KERNEL(leaky_relu, LeakyRelu, LeakyReluFunctor, - LeakyReluGradFunctor); +REGISTER_ACTIVATION_CUDA_KERNEL(leaky_relu, LeakyRelu, CudaLeakyReluFunctor, + CudaLeakyReluGradFunctor); REGISTER_OP_CUDA_KERNEL( leaky_relu_grad_grad, @@ -48,7 +1425,7 @@ REGISTER_OP_CUDA_KERNEL( /* ========================================================================== */ /* ======================== elu register ============================ */ -REGISTER_ACTIVATION_CUDA_KERNEL(elu, ELU, ELUFunctor, ELUGradFunctor); +REGISTER_ACTIVATION_CUDA_KERNEL(elu, ELU, CudaELUFunctor, CudaELUGradFunctor); REGISTER_OP_CUDA_KERNEL( elu_grad_grad, ops::ELUDoubleGradKernel>); /* ========================================================================== */ +/* =========================== tanh register ============================ */ +REGISTER_ACTIVATION_CUDA_KERNEL(tanh, Tanh, CudaTanhFunctor, + CudaTanhGradFunctor); + +REGISTER_OP_CUDA_KERNEL( + tanh_grad_grad, + ops::TanhDoubleGradKernel>, + ops::TanhDoubleGradKernel>, + ops::TanhDoubleGradKernel>); +/* ========================================================================== */ + /* =========================== sqrt register ============================= */ -REGISTER_ACTIVATION_CUDA_KERNEL(sqrt, Sqrt, SqrtFunctor, SqrtGradFunctor); +REGISTER_ACTIVATION_CUDA_KERNEL(sqrt, Sqrt, CudaSqrtFunctor, + CudaSqrtGradFunctor); REGISTER_OP_CUDA_KERNEL( sqrt_grad_grad, @@ -87,7 +1480,8 @@ REGISTER_OP_CUDA_KERNEL( /* =========================== rsqrt register ============================= */ -REGISTER_ACTIVATION_CUDA_KERNEL(rsqrt, Rsqrt, RsqrtFunctor, RsqrtGradFunctor); +REGISTER_ACTIVATION_CUDA_KERNEL(rsqrt, Rsqrt, CudaRsqrtFunctor, + CudaRsqrtGradFunctor); REGISTER_OP_CUDA_KERNEL( rsqrt_grad_grad, @@ -100,25 +1494,8 @@ REGISTER_OP_CUDA_KERNEL( /* ========================================================================== */ /* =========================== square register ============================ */ -REGISTER_OP_CUDA_KERNEL( - square, - ops::ActivationKernel>, - ops::ActivationKernel>, - ops::ActivationKernel>, - ops::ActivationKernel>, - ops::ActivationKernel>); -REGISTER_OP_CUDA_KERNEL( - square_grad, ops::ActivationGradKernel>, - ops::ActivationGradKernel>, - ops::ActivationGradKernel>, - ops::ActivationGradKernel>, - ops::ActivationGradKernel>); +REGISTER_ACTIVATION_CUDA_KERNEL_INT(square, Square, CudaSquareFunctor, + CudaSquareGradFunctor); REGISTER_OP_CUDA_KERNEL( square_grad_grad, @@ -135,7 +1512,6 @@ REGISTER_OP_CUDA_KERNEL( /* 
========================================================================== */ /* ========================== pow register ============================ */ - REGISTER_OP_CUDA_KERNEL( pow, ops::PowKernel>, ops::PowKernel>, @@ -153,29 +1529,30 @@ REGISTER_OP_CUDA_KERNEL( /* ========================================================================== */ /* ========================== exp register ============================ */ - REGISTER_OP_CUDA_KERNEL( - exp, ops::ActivationKernel>, - ops::ActivationKernel>, + exp, ops::ActivationCudaKernel>, + ops::ActivationCudaKernel>, ops::ActivationKernel>, ops::ActivationKernel>, - ops::ActivationKernel>); + ops::ActivationCudaKernel>); REGISTER_OP_CUDA_KERNEL( - exp_grad, ops::ActivationGradKernel>, - ops::ActivationGradKernel>, - ops::ActivationGradKernel>, - ops::ActivationGradKernel>, - ops::ActivationGradKernel>); + exp_grad, ops::ActivationGradCudaKernel>, + ops::ActivationGradCudaKernel>, + ops::ActivationGradCudaKernel>, + ops::ActivationGradCudaKernel>, + ops::ActivationGradCudaKernel>); /* ========================================================================== */ /* ========================== Log register ==================================*/ -REGISTER_ACTIVATION_CUDA_KERNEL(log, Log, LogFunctor, LogGradFunctor); +REGISTER_ACTIVATION_CUDA_KERNEL(log, Log, CudaLogFunctor, CudaLogGradFunctor); REGISTER_OP_CUDA_KERNEL( log_grad_grad, ops::LogDoubleGradKernel>); /* ========================================================================== */ + +#define FOR_EACH_ACTIVATION_CUDA_OP(__macro) \ + __macro(sigmoid, Sigmoid, CudaSigmoidFunctor, CudaSigmoidGradFunctor); \ + __macro(silu, Silu, CudaSiluFunctor, CudaSiluGradFunctor); \ + __macro(logsigmoid, LogSigmoid, CudaLogSigmoidFunctor, \ + CudaLogSigmoidGradFunctor); \ + __macro(atan, Atan, CudaAtanFunctor, CudaAtanGradFunctor); \ + __macro(softshrink, SoftShrink, CudaSoftShrinkFunctor, \ + CudaSoftShrinkGradFunctor); \ + __macro(ceil, Ceil, CudaCeilFunctor, CudaZeroGradFunctor); \ + __macro(floor, Floor, CudaFloorFunctor, CudaZeroGradFunctor); \ + __macro(cos, Cos, CudaCosFunctor, CudaCosGradFunctor); \ + __macro(tan, Tan, CudaTanFunctor, CudaTanGradFunctor); \ + __macro(acos, Acos, CudaAcosFunctor, CudaAcosGradFunctor); \ + __macro(sin, Sin, CudaSinFunctor, CudaSinGradFunctor); \ + __macro(asin, Asin, CudaAsinFunctor, CudaAsinGradFunctor); \ + __macro(sinh, Sinh, CudaSinhFunctor, CudaSinhGradFunctor); \ + __macro(cosh, Cosh, CudaCoshFunctor, CudaCoshGradFunctor); \ + __macro(round, Round, CudaRoundFunctor, CudaZeroGradFunctor); \ + __macro(reciprocal, Reciprocal, CudaReciprocalFunctor, \ + CudaReciprocalGradFunctor); \ + __macro(log1p, Log1p, CudaLog1pFunctor, CudaLog1pGradFunctor); \ + __macro(log2, Log2, CudaLog2Functor, CudaLog2GradFunctor); \ + __macro(log10, Log10, CudaLog10Functor, CudaLog10GradFunctor); \ + __macro(brelu, BRelu, CudaBReluFunctor, CudaBReluGradFunctor); \ + __macro(soft_relu, SoftRelu, CudaSoftReluFunctor, CudaSoftReluGradFunctor); \ + __macro(stanh, STanh, CudaSTanhFunctor, CudaSTanhGradFunctor); \ + __macro(softplus, Softplus, CudaSoftplusFunctor, CudaSoftplusGradFunctor); \ + __macro(softsign, Softsign, CudaSoftsignFunctor, CudaSoftsignGradFunctor); \ + __macro(relu6, Relu6, CudaRelu6Functor, CudaRelu6GradFunctor); \ + __macro(tanh_shrink, TanhShrink, CudaTanhShrinkFunctor, \ + CudaTanhShrinkGradFunctor); \ + __macro(hard_shrink, HardShrink, CudaHardShrinkFunctor, \ + CudaHardShrinkGradFunctor); \ + __macro(hard_sigmoid, HardSigmoid, CudaHardSigmoidFunctor, \ + 
CudaHardSigmoidGradFunctor); \ + __macro(swish, Swish, CudaSwishFunctor, CudaSwishGradFunctor); \ + __macro(thresholded_relu, ThresholdedRelu, CudaThresholdedReluFunctor, \ + CudaThresholdedReluGradFunctor); \ + __macro(hard_swish, HardSwish, CudaHardSwishFunctor, \ + CudaHardSwishGradFunctor); +FOR_EACH_ACTIVATION_CUDA_OP(REGISTER_ACTIVATION_CUDA_KERNEL) diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index bc7def61b2e249d3fa5e8b1d915b86f50beaf77b..ccd5bf528ba58ca731513a1a1fafce3f2f64c470 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -258,6 +258,31 @@ struct SigmoidGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } }; +// silu(x) = x / (1 + exp(-x)) +template +struct SiluFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + auto temp = static_cast(1) / (static_cast(1) + (-x).exp()); + out.device(d) = x * temp; + } +}; + +// silu'(x) = (1 / (1 + e^{-x})) * (1 + out * e^{-x})) +template +struct SiluGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + auto temp1 = static_cast(1) + (-x).exp(); // 1+e^(-x) + auto temp2 = x * (-x).exp(); // x*e^(-x) + dx.device(d) = dout * ((static_cast(1) / temp1) * + (static_cast(1) + (temp2 / temp1))); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + // Originally: logsigmoid(x) = -log (1 + exp(-x)) // For numerical stability, we can use the log-sum-exp trick: // https://hips.seas.harvard.edu/blog/2013/01/09/computing-log-sum-exp/ @@ -366,6 +391,36 @@ struct TanhGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } }; +template +struct TanhGradGradFunctor : public BaseActivationFunctor { + template + void operator()(const Device& dev, const framework::Tensor* Out, + const framework::Tensor* ddX, const framework::Tensor* dOut, + framework::Tensor* dOutNew, framework::Tensor* ddOut) const { + auto* d = dev.eigen_device(); + auto ddx = framework::EigenVector::Flatten( + GET_DATA_SAFELY(ddX, "Input", "DDX", "TanhGradGrad")); + auto out = framework::EigenVector::Flatten( + GET_DATA_SAFELY(Out, "Input", "Out", "TanhGradGrad")); + // tanh grad grad : ddout = (1 - out^2) * ddx, dout = - (dout_old * 2 * out + // * ddx) + if (dOutNew) { + auto dout = framework::EigenVector::Flatten( + GET_DATA_SAFELY(dOut, "Input", "DOut", "TanhGradGrad")); + auto dout_new = framework::EigenVector::Flatten( + GET_DATA_SAFELY(dOutNew, "Output", "DOutNew", "SquareGradGrad")); + dout_new.device(*d) = + static_cast(-1) * dout * static_cast(2) * out * ddx; + } + if (ddOut) { + auto ddout = framework::EigenVector::Flatten( + GET_DATA_SAFELY(ddOut, "Output", "DDOut", "SquareGradGrad")); + ddout.device(*d) = (static_cast(1) - out * out) * ddx; + } + } + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } +}; + // tanhshrink(x) = x - tanh(x) // where tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x)) template @@ -400,7 +455,7 @@ struct HardShrinkFunctor : public BaseActivationFunctor { void operator()(Device d, X x, Out out) const { auto temp1 = x < static_cast(threshold * -1.f); auto temp2 = x > static_cast(threshold); - out.device(d) = x * (temp1 + temp2 > 0).template cast(); + out.device(d) = x * (temp1 || temp2).template cast(); } }; @@ -417,7 +472,7 @@ struct HardShrinkGradFunctor : public BaseActivationFunctor { void 
operator()(Device d, X x, Out out, dOut dout, dX dx) const { auto temp1 = x < static_cast(threshold * -1.f); auto temp2 = x > static_cast(threshold); - dx.device(d) = dout * (temp1 + temp2 > 0).template cast(); + dx.device(d) = dout * (temp1 || temp2).template cast(); } static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } @@ -1734,6 +1789,58 @@ inline void ExtractDoubleGradTensorWithInputDOut( } } +template +class TanhDoubleGradKernel + : public framework::OpKernel { + public: + using T = typename Functor::ELEMENT_TYPE; + void Compute(const framework::ExecutionContext& ctx) const override { + const framework::Tensor *Out, *ddX, *dOut; + framework::Tensor *dOutNew, *ddOut; + Out = ddX = dOut = nullptr; + dOutNew = ddOut = nullptr; + + // extract ddx(input) and out(input) + auto ddx_var = ctx.InputVar("DDX"); + auto out_var = ctx.InputVar("Out"); + PADDLE_ENFORCE_NOT_NULL( + ddx_var, platform::errors::NotFound( + "Cannot get input Variable ddx, variable name = %s", + ctx.InputName("DDX"))); + PADDLE_ENFORCE_NOT_NULL( + out_var, platform::errors::NotFound( + "Cannot get input Variable out, variable name = %s", + ctx.InputName("Out"))); + ddX = ctx.Input("DDX"); + Out = ctx.Input("Out"); + + // set output ddout + auto ddout_var = ctx.OutputVar("DDOut"); + if (ddout_var) { + ddOut = ctx.Output("DDOut"); + } + + // extract dOut(intput) + auto dout_var = ctx.InputVar("DOut"); + PADDLE_ENFORCE_NOT_NULL( + dout_var, platform::errors::NotFound( + "Cannot get input Variable dout_var, variable name = %s", + ctx.InputName("DOut"))); + dOut = ctx.Input("DOut"); + + // set output dout_new + auto dout_new_var = ctx.OutputVar("DOutNew"); + if (dout_new_var) { + dOutNew = ctx.Output("DOutNew"); + } + + if (dOutNew) dOutNew->mutable_data(Out->dims(), ctx.GetPlace()); + if (ddOut) ddOut->mutable_data(Out->dims(), ctx.GetPlace()); + auto& place = ctx.template device_context(); + Functor functor; + functor(place, Out, ddX, dOut, dOutNew, ddOut); + } +}; template class SquareDoubleGradKernel : public framework::OpKernel { @@ -2047,8 +2154,8 @@ struct LogGradGradFunctor : public BaseActivationFunctor { #define FOR_EACH_ACTIVATION_OP(__macro) \ __macro(sigmoid, Sigmoid, SigmoidFunctor, SigmoidGradFunctor); \ + __macro(silu, Silu, SiluFunctor, SiluGradFunctor); \ __macro(logsigmoid, LogSigmoid, LogSigmoidFunctor, LogSigmoidGradFunctor); \ - __macro(tanh, Tanh, TanhFunctor, TanhGradFunctor); \ __macro(atan, Atan, AtanFunctor, AtanGradFunctor); \ __macro(softshrink, SoftShrink, SoftShrinkFunctor, SoftShrinkGradFunctor); \ __macro(ceil, Ceil, CeilFunctor, ZeroGradFunctor); \ diff --git a/paddle/fluid/operators/activation_op_npu.cc b/paddle/fluid/operators/activation_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..f368c658230555c5a3529b39dfc1b60b1cab56e4 --- /dev/null +++ b/paddle/fluid/operators/activation_op_npu.cc @@ -0,0 +1,367 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the Licnse. 
*/ + +#include +#include + +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/activation_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class PowNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + auto factor = ctx.Attr("factor"); + + out->mutable_data(ctx.GetPlace()); + + auto runner = NpuOpRunner("Power", {*x}, {*out}, + {{"power", factor}, + {"scale", static_cast(1.0)}, + {"shift", static_cast(0.0)}}); + + auto stream = + ctx.template device_context() + .stream(); + runner.Run(stream); + } +}; + +template +class PowGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto factor = ctx.Attr("factor"); + + auto x_dims = x->dims(); + + auto place = ctx.GetPlace(); + auto stream = + ctx.template device_context() + .stream(); + + // NOTE(liym27): dx = dout * factor * x.pow(factor-1) + + // Step1: Compute x_pow = x.pow(factor-1) + Tensor x_pow(x->type()); + x_pow.mutable_data(x->dims(), place); + auto runner_pow = NpuOpRunner("Power", {*x}, {x_pow}, + {{"power", factor - static_cast(1)}}); + runner_pow.Run(stream); + + // Step 2: Construct a broadcast factor, which has the same shape with x. + + // 2.1 Get a factor tensor with shape [1]. + Tensor factor_tensor(framework::proto::VarType::FP32); + factor_tensor.mutable_data({1}, place); + FillNpuTensorWithConstant(&factor_tensor, factor); + + // 2.2 Get the factor which has the shape with x and the same value with + // factor. 
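As a side note (illustration only, not part of the patch): the op sequence in this kernel (Power, then the FillD broadcast prepared below, then two Mul runs) is just the ordinary power rule, dx = dout * factor * x^(factor - 1); the broadcast step exists only because the NPU "Mul" op multiplies two operands of the same shape, while factor arrives as a scalar attribute. A minimal CPU sketch of the same formula, with the helper name PowGradReference invented for this note:

#include <cmath>
#include <cstddef>
#include <vector>

// CPU reference for the power-rule gradient assembled by the NPU ops in this
// kernel: dx[i] = dout[i] * factor * x[i]^(factor - 1).
std::vector<float> PowGradReference(const std::vector<float>& x,
                                    const std::vector<float>& dout,
                                    float factor) {
  std::vector<float> dx(x.size());
  for (std::size_t i = 0; i < x.size(); ++i) {
    dx[i] = dout[i] * factor * std::pow(x[i], factor - 1.0f);
  }
  return dx;
}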
+ Tensor factor_bc_tensor(framework::proto::VarType::FP32); + factor_bc_tensor.mutable_data(x_dims, place); + auto runner_bc = NpuOpRunner("FillD", {factor_tensor}, {factor_bc_tensor}, + {{"dims", framework::vectorize(x_dims)}}); + runner_bc.Run(stream); + + // Step 3: Compute x_power_mul_factor = factor * x.pow(factor-1) + Tensor x_power_mul_factor(x->type()); + x_power_mul_factor.mutable_data(x->dims(), place); + auto runner_mul_1 = + NpuOpRunner("Mul", {factor_bc_tensor, x_pow}, {x_power_mul_factor}, {}); + runner_mul_1.Run(stream); + + // Step 4: Compute dx = dout * factor * x.pow(factor-1) + dx->mutable_data(place); + auto runner_mul_2 = + NpuOpRunner("Mul", {*dout, x_power_mul_factor}, {*dx}, {}); + runner_mul_2.Run(stream); + } +}; + +template +class ReluNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + + out->mutable_data(ctx.GetPlace()); + + auto runner = NpuOpRunner("Relu", + { + *x, + }, + {*out}, {}); + + auto stream = + ctx.template device_context() + .stream(); + runner.Run(stream); + } +}; + +template +class ReluGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + + auto stream = + ctx.template device_context() + .stream(); + + dx->mutable_data(ctx.GetPlace()); + auto runner = NpuOpRunner("ReluGrad", {*dout, *out}, {*dx}, {}); + + runner.Run(stream); + } +}; + +template +class SqrtNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + + auto* out = ctx.Output("Out"); + + auto place = ctx.GetPlace(); + + out->mutable_data(place); + + auto stream = + ctx.template device_context() + .stream(); + + auto runner = NpuOpRunner("Sqrt", {*x}, {*out}, {}); + runner.Run(stream); + } +}; + +template +class SqrtGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + + auto* dx = ctx.Output(framework::GradVarName("X")); + + auto place = ctx.GetPlace(); + + dx->mutable_data(place); + + auto stream = + ctx.template device_context() + .stream(); + + auto dx_runner = NpuOpRunner("SqrtGrad", {*out, *dout}, {*dx}, {}); + dx_runner.Run(stream); + } +}; + +template +class LogNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + + auto* out = ctx.Output("Out"); + + auto place = ctx.GetPlace(); + + out->mutable_data(place); + + auto stream = + ctx.template device_context() + .stream(); + + Tensor one(x->type()); + one.mutable_data(x->dims(), place); + auto one_runner = NpuOpRunner("OnesLike", {*x}, {one}, {}); + one_runner.Run(stream); + + Tensor sub(x->type()); + sub.mutable_data(x->dims(), place); + auto sub_runner = NpuOpRunner("Sub", {*x, one}, {sub}, {}); + sub_runner.Run(stream); + + auto out_runner = NpuOpRunner("Log1p", {sub}, {*out}, {}); + out_runner.Run(stream); + } +}; + +template +class LogGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* x = 
ctx.Input("X"); + + auto* dx = ctx.Output(framework::GradVarName("X")); + + auto place = ctx.GetPlace(); + + dx->mutable_data(place); + + auto stream = + ctx.template device_context() + .stream(); + auto runner = NpuOpRunner("DivNoNan", {*dout, *x}, {*dx}, {}); + runner.Run(stream); + } +}; + +template +class TanhNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + + auto* out = ctx.Output("Out"); + + auto place = ctx.GetPlace(); + + out->mutable_data(place); + + auto stream = + ctx.template device_context() + .stream(); + + auto runner = NpuOpRunner("Tanh", {*x}, {*out}, {}); + runner.Run(stream); + } +}; + +template +class TanhGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* out = ctx.Input("Out"); + + auto* dx = ctx.Output(framework::GradVarName("X")); + + auto place = ctx.GetPlace(); + + dx->mutable_data(place); + + auto stream = + ctx.template device_context() + .stream(); + + auto dx_runner = NpuOpRunner("TanhGrad", {*out, *dout}, {*dx}, {}); + dx_runner.Run(stream); + } +}; + +template +class SquareNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + + auto* out = ctx.Output("Out"); + + auto place = ctx.GetPlace(); + + out->mutable_data(place); + + auto stream = + ctx.template device_context() + .stream(); + + auto runner = NpuOpRunner("Square", {*x}, {*out}, {}); + runner.Run(stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL( + pow, ops::PowNPUKernel, + ops::PowNPUKernel); + +REGISTER_OP_NPU_KERNEL( + pow_grad, ops::PowGradNPUKernel, + ops::PowGradNPUKernel); + +REGISTER_OP_NPU_KERNEL( + relu, ops::ReluNPUKernel, + ops::ReluNPUKernel); + +REGISTER_OP_NPU_KERNEL( + relu_grad, + ops::ReluGradNPUKernel, + ops::ReluGradNPUKernel); + +REGISTER_OP_NPU_KERNEL( + sqrt, ops::SqrtNPUKernel, + ops::SqrtNPUKernel); + +REGISTER_OP_NPU_KERNEL( + sqrt_grad, + ops::SqrtGradNPUKernel, + ops::SqrtGradNPUKernel); + +REGISTER_OP_NPU_KERNEL( + log, ops::LogNPUKernel, + ops::LogNPUKernel); + +REGISTER_OP_NPU_KERNEL( + log_grad, ops::LogGradNPUKernel, + ops::LogGradNPUKernel); + +REGISTER_OP_NPU_KERNEL( + tanh, ops::TanhNPUKernel, + ops::TanhNPUKernel); + +REGISTER_OP_NPU_KERNEL( + tanh_grad, + ops::TanhGradNPUKernel, + ops::TanhGradNPUKernel); + +REGISTER_OP_NPU_KERNEL( + square, ops::SquareNPUKernel, + ops::SquareNPUKernel, + ops::SquareNPUKernel); diff --git a/paddle/fluid/operators/addmm_op.h b/paddle/fluid/operators/addmm_op.h index 97e3ed9c1adda0082a7ce74cfb5d9b6cb78dde63..ecfd10d2fa6fbdbfa37bfd4f3597b8fbf0a0c7c7 100644 --- a/paddle/fluid/operators/addmm_op.h +++ b/paddle/fluid/operators/addmm_op.h @@ -18,6 +18,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/eigen/eigen_function.h" #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/math_function.h" @@ -32,8 +33,8 @@ template using EigenTensor = framework::EigenTensor; -using Array1 = Eigen::DSizes; -using Array2 = Eigen::DSizes; +using Array1 = Eigen::DSizes; +using Array2 = Eigen::DSizes; using Tensor = framework::Tensor; @@ -105,7 +106,8 @@ class AddMMKernel : public framework::OpKernel { auto eigen_out = EigenTensor::From(*out); auto& place = *context.template device_context().eigen_device(); - eigen_out.device(place) = eigen_input.broadcast(bcast_dims); + EigenBroadcast, T, 2>::Eval( + place, eigen_out, eigen_input, bcast_dims); blas.GEMM(false, false, x_dims[0], y_dims[1], x_dims[1], alpha, x->data(), x_dims[1], y->data(), y_dims[1], beta, diff --git a/paddle/fluid/operators/amp/CMakeLists.txt b/paddle/fluid/operators/amp/CMakeLists.txt index b3ff52a7ae119ded7a305c97f3365d1a72d50acf..2ea8bbcbc61df84eb445b1a512653d66f600c46a 100644 --- a/paddle/fluid/operators/amp/CMakeLists.txt +++ b/paddle/fluid/operators/amp/CMakeLists.txt @@ -4,3 +4,7 @@ if(WITH_UNITY_BUILD) include(unity_build_rule.cmake) endif() register_operators() + +if(WITH_ASCEND_CL) + cc_test(check_finite_and_unscale_op_npu_test SRCS check_finite_and_unscale_op_npu_test.cc DEPS op_registry check_finite_and_unscale_op scope device_context enforce executor) +endif() diff --git a/paddle/fluid/operators/amp/alloc_float_status_op.cc b/paddle/fluid/operators/amp/alloc_float_status_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..181dd6eabe22d7d0c82b7c8f17625d787008f00b --- /dev/null +++ b/paddle/fluid/operators/amp/alloc_float_status_op.cc @@ -0,0 +1,74 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include + +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +class AllocFloatStatusOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasOutput("FloatStatus"), "Output", "FloatStatus", + "alloc_float_status"); + ctx->SetOutputDim("FloatStatus", {8}); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(framework::proto::VarType::FP32, + ctx.GetPlace()); + } +}; + +class AllocFloatStatusMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddOutput("FloatStatus", + "(Tensor) of shape {8} that holds the float status."); + AddComment(R"DOC( + Produces a float Tensor that holds the float status +)DOC"); + } +}; + +template +class AllocFloatStatusKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_THROW(platform::errors::Unimplemented( + "Operator alloc_float_status is not supported on CPU")); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +using CPU = paddle::platform::CPUDeviceContext; + +REGISTER_OPERATOR( + alloc_float_status, ops::AllocFloatStatusOp, ops::AllocFloatStatusMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker); + +REGISTER_OP_CPU_KERNEL(alloc_float_status, + ops::AllocFloatStatusKernel); diff --git a/paddle/fluid/operators/amp/alloc_float_status_op_npu.cc b/paddle/fluid/operators/amp/alloc_float_status_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..fe5b08af52a624b29100635ee34cfac7c2d2a859 --- /dev/null +++ b/paddle/fluid/operators/amp/alloc_float_status_op_npu.cc @@ -0,0 +1,47 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class AllocFloatStatusKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* float_status = ctx.Output("FloatStatus"); + float_status->mutable_data(ctx.GetPlace()); + + auto runner = NpuOpRunner("NPUAllocFloatStatus", {}, {*float_status}); + auto stream = + ctx.template device_context() + .stream(); + runner.Run(stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL( + alloc_float_status, + ops::AllocFloatStatusKernel); diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op.cc index 9d78936ad5f7f2618eb766d84de2c631fc0cf8c5..c7520dbd34f6a92afb5c2fe320197fdad8e95379 100644 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op.cc +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op.cc @@ -60,6 +60,12 @@ class CheckFiniteAndUnscaleOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("Scale", "(Tensor) 1-dim tensor, the scale of check_finite_and_unscale " "operator."); +#ifdef PADDLE_WITH_ASCEND_CL + AddInput("FloatStatus", + "(Tensor) 1-dim tensor of shape [8], allocated by " + "alloc_float_status op") + .AsDispensable(); +#endif AddOutput("Out", "(Tensors) The scaled output tensor of " "check_finite_and_unscale operator.") diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu b/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu index 6840e4847c4c6485c2815e0634bcd7aaa16783b4..c699486a9140a388dc79359cf3cc40fc61e4f45b 100644 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu @@ -26,18 +26,51 @@ __global__ void InverseAndMemset(const T* s, T* o, bool* found_inf) { } template -__global__ void CheckFiniteAndUnscale(const T* in, const MT* scale, int num, - bool* found_inf, T* out) { - const int idx = threadIdx.x + blockIdx.x * blockDim.x; - - if (idx < num) { - MT val = static_cast(in[idx]) * (*scale); +__global__ void CheckFiniteAndUnscale(const T** xs, const MT* scale, + int64_t size, int64_t* starts, + bool* found_inf, T** outs) { + const int64_t tid = threadIdx.x + blockIdx.x * blockDim.x; + + // copy starts array from global memory to shared memory + extern __shared__ int64_t s_starts[]; + for (int i = threadIdx.x; i <= size; i += blockDim.x) { + s_starts[i] = starts[i]; + } + __syncthreads(); + + const int64_t num = s_starts[size]; + int xs_index = 0; + bool local_found_inf = false; + const MT local_scale = *scale; + for (int64_t idx = tid; idx < num; idx += gridDim.x * blockDim.x) { + // get the "out" index of "id" + // For example: + // idx = 15, starts = [0, 10, 10, 20, 30] + // because 10 <= idx < 20 ==> + // the idx element locate in the 3rd tensor (notice the 2nd tensor size is + // 0) + int next_xs_index = xs_index; + while (idx >= s_starts[next_xs_index]) next_xs_index++; + xs_index = next_xs_index - 1; + + // get in data and out data + const T* in = xs[xs_index]; + T* out = outs[xs_index]; + int64_t in_idx = idx - s_starts[xs_index]; + + // Unscale + MT val = static_cast(in[in_idx]) * local_scale; T narrow_val = static_cast(val); - out[idx] = narrow_val; + out[in_idx] = narrow_val; + + // CheckFinite if (!isfinite(narrow_val)) 
{ - *found_inf = true; + local_found_inf = true; } } + if (local_found_inf) { + *found_inf = true; + } } template @@ -63,20 +96,57 @@ class CheckFiniteAndUnscaleGpuKernel : public framework::OpKernel { InverseAndMemset<<<1, 1, 0, dev_ctx.stream()>>>( scale_data, inverse_scale_v, found_inf_data); - for (size_t i = 0; i < xs.size(); ++i) { - const auto* x = xs[i]; - auto* out = outs[i]; - const T* x_data = x->data(); - T* out_data = out->mutable_data(dev_ctx.GetPlace()); - - int num = x->numel(); - int block = 1024; - int grid = (num + block - 1) / block; - VLOG(3) << "launch kernel"; - CheckFiniteAndUnscale<<>>( - x_data, inverse_scale_v, num, found_inf_data, out_data); - VLOG(3) << "finish kernel"; + size_t xs_size = xs.size(); + const auto& cpu_place = platform::CPUPlace(); + // calculate each tensor's start index and copy to device + auto h_starts_tensor = + memory::Alloc(cpu_place, (xs_size + 1) * sizeof(int64_t)); + int64_t* h_starts = reinterpret_cast(h_starts_tensor->ptr()); + + auto d_starts_tensor = + memory::Alloc(dev_ctx, (xs_size + 1) * sizeof(int64_t)); + int64_t* d_starts = reinterpret_cast(d_starts_tensor->ptr()); + + // the start index value of each tensor is + // the sum of previous tensor's size. For example: + // xs = [10, 0, 10, 10] ==> starts = [0, 10, 10, 20, 30] + h_starts[0] = 0; + for (int i = 1; i <= xs_size; i++) { + h_starts[i] = h_starts[i - 1] + xs[i - 1]->numel(); + } + int64_t total_num = h_starts[xs_size]; + memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), + d_starts, cpu_place, h_starts, (xs_size + 1) * sizeof(int64_t), + dev_ctx.stream()); + + // copy each tensor's data address to device + auto h_mem = memory::Alloc(cpu_place, 2 * xs_size * sizeof(T*)); + const T** h_xs = reinterpret_cast(h_mem->ptr()); + T** h_outs = reinterpret_cast(h_mem->ptr()) + xs_size; + + auto d_mem = memory::Alloc(dev_ctx, 2 * xs_size * sizeof(T*)); + const T** d_xs = reinterpret_cast(d_mem->ptr()); + T** d_outs = reinterpret_cast(d_mem->ptr()) + xs_size; + + for (size_t i = 0; i < xs_size; ++i) { + h_xs[i] = xs[i]->data(); + h_outs[i] = outs[i]->mutable_data(dev_ctx.GetPlace()); } + memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), d_xs, + cpu_place, h_xs, 2 * xs_size * sizeof(T*), dev_ctx.stream()); + + // Launch Kernel + int threads_per_block = std::min(static_cast(1024), total_num); + int elements_per_block = + threads_per_block * 20; // each thread deal with 20 number + int blocks_per_grid = + (total_num + elements_per_block - 1) / elements_per_block; + VLOG(3) << "launch kernel"; + CheckFiniteAndUnscale< + T, MPDType><<>>( + d_xs, inverse_scale_v, xs_size, d_starts, found_inf_data, d_outs); + VLOG(3) << "finish kernel"; } }; } // namespace operators diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..8fd45326e4ec6134cf4b98be12212ce8d7d74541 --- /dev/null +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc @@ -0,0 +1,130 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include + +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/amp/check_finite_and_unscale_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +// NOTE(zhiqiu): The CheckFiniteAndUnscaleNPUKernel is different from CUDA. +// On NPU, we do not really check the data of input tensors, +// but use NPUGetFloatStatus to check whether the nan/inf occurs on device, +// and clear it after this op. +// Which may leads to wrong result if the input tensors is not calculated +// on NPU device, but got from other way, for example, feeding. +template +class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + const auto xs = ctx.MultiInput("X"); + const auto* scale = ctx.Input("Scale"); + const auto* float_status = ctx.Input("FloatStatus"); + auto outs = ctx.MultiOutput("Out"); + auto* found_inf = ctx.Output("FoundInfinite"); + + found_inf->mutable_data(ctx.GetPlace()); + + bool found_inf_data = false; + + auto stream = + ctx.template device_context() + .stream(); + + // step1: inverse scale(RealDiv) + Tensor const_tensor; + const_tensor.mutable_data({1}, ctx.GetPlace()); + FillNpuTensorWithConstant(&const_tensor, static_cast(1.0)); + + // Inverse(1.0/scale) + Tensor* tmp_inverse_out = const_cast(scale); + Tensor inverse_out(scale->type()); + inverse_out.Resize(scale->dims()); + inverse_out.mutable_data(ctx.GetPlace()); + auto runner_inverse = + NpuOpRunner("Div", {const_tensor, *scale}, {inverse_out}, {}); + runner_inverse.Run(stream); + tmp_inverse_out = &inverse_out; + + // NOTE(zhiqiu): + Tensor tmp; + tmp.mutable_data({8}, ctx.GetPlace()); + + // NOTE(zhiqiu): NPUGetFloatStatus updates data on input in-place. + // tmp is only placeholder. 
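For orientation (illustration only, not part of the patch): per input tensor, the operator's intended behaviour is simply "multiply by 1/scale and raise a flag if any element is NaN/Inf". On the NPU the per-element finiteness test is replaced by reading the device float-status register through the NPUGetFloatStatus / ReduceSumD calls below, and the kernel skips the multiply entirely once an overflow has been detected. A minimal CPU sketch of those semantics, with the name CheckFiniteAndUnscaleReference invented for this note:

#include <cmath>
#include <cstddef>
#include <vector>

// out[i] = x[i] * (1 / scale); returns true if any element of x is NaN/Inf.
// On the NPU the isfinite check is done via the hardware float status, as the
// NOTE above explains, rather than by inspecting each element.
bool CheckFiniteAndUnscaleReference(const std::vector<float>& x, float scale,
                                    std::vector<float>* out) {
  bool found_inf = false;
  const float inverse_scale = 1.0f / scale;
  out->resize(x.size());
  for (std::size_t i = 0; i < x.size(); ++i) {
    (*out)[i] = x[i] * inverse_scale;
    if (!std::isfinite(x[i])) found_inf = true;
  }
  return found_inf;
}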
+ auto runner_float_status = + NpuOpRunner("NPUGetFloatStatus", {*float_status}, {tmp}, + {{"message", std::string("check_nan_and_inf")}}); + runner_float_status.Run(stream); + + Tensor sum; + sum.mutable_data({1}, ctx.GetPlace()); + auto runner_reduce_sum = + NpuOpRunner("ReduceSumD", {*float_status}, {sum}, + {{"axes", std::vector{0}}, {"keep_dims", true}}); + runner_reduce_sum.Run(stream); + + std::vector sum_vec; + TensorToVector( + sum, ctx.template device_context(), + &sum_vec); + found_inf_data = (sum_vec[0] > 1); + + VLOG(4) << "found_inf_data:" << found_inf_data; + + for (size_t i = 0; i < xs.size(); ++i) { + const auto* x = xs[i]; + auto* out = outs[i]; + out->mutable_data(ctx.GetPlace()); + if (!found_inf_data) { + // MatMul + auto runner_matmul = + NpuOpRunner("Mul", {*x, *tmp_inverse_out}, {*out}, {}); + runner_matmul.Run(stream); + } + } + + // set found_inf to true + VLOG(4) << "found overflow:" << found_inf_data; + Tensor found_inf_tensor; + found_inf_tensor.Resize({1}); + bool* is_found_inf = + found_inf_tensor.mutable_data(paddle::platform::CPUPlace()); + *is_found_inf = found_inf_data; + + framework::TensorCopy( + found_inf_tensor, ctx.GetPlace(), + ctx.template device_context(), found_inf); + ctx.template device_context().Wait(); + + auto runner_clear_status = + NpuOpRunner("NPUClearFloatStatus", {*float_status}, {tmp}); + runner_clear_status.Run(stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; +REGISTER_OP_NPU_KERNEL(check_finite_and_unscale, + ops::CheckFiniteAndUnscaleNPUKernel, + ops::CheckFiniteAndUnscaleNPUKernel); diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..a80b83f0cbe51fe536955b047d7be1b4c451a5a9 --- /dev/null +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc @@ -0,0 +1,131 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include +#include +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/enforce.h" + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +using Tensor = paddle::framework::Tensor; + +USE_OP(check_finite_and_unscale); +USE_OP_DEVICE_KERNEL(check_finite_and_unscale, NPU); + +struct InputVars { + std::string name; + f::LoDTensor *tensor; +}; + +template +void Compare(f::Scope *scope, const p::DeviceContext &ctx) { + const f::DDim dims = f::make_ddim({2, 2}); + auto place = ctx.GetPlace(); + + // init input + std::vector input_names = { + {"x", scope->Var("x")->GetMutable()}, + {"x1", scope->Var("x1")->GetMutable()}}; + + auto *scale = scope->Var("scale")->GetMutable(); + + // init output + auto *out = scope->Var("out")->GetMutable(); + auto *out1 = scope->Var("out1")->GetMutable(); + auto *found_inf = scope->Var("found_inf")->GetMutable(); + + // Initialize input data + const int num_inputs = input_names.size(); + size_t numel = static_cast(f::product(dims)); + + for (int i = 0; i < num_inputs; ++i) { + std::vector init_xs; + for (size_t j = 0; j < numel; ++j) { + if (j == 0) { + init_xs.push_back(static_cast(NAN)); + } else { + init_xs.push_back(static_cast(j + 1)); + } + } + f::TensorFromVector(init_xs, ctx, input_names[i].tensor); + input_names[i].tensor->Resize(dims); + } + + f::TensorFromVector(std::vector{static_cast(0.5)}, ctx, scale); + + ctx.Wait(); + + // run + f::AttributeMap attrs; + auto op = f::OpRegistry::CreateOp( + "check_finite_and_unscale", {{"X", {"x", "x1"}}, {"Scale", {"scale"}}}, + {{"Out", {"out", "out1"}}, {"FoundInfinite", {"found_inf"}}}, attrs); + op->Run(*scope, place); + ctx.Wait(); + + // out0 + std::vector out_vec; + f::TensorToVector(*out, ctx, &out_vec); + EXPECT_EQ(out_vec.size(), static_cast(4)); + for (size_t j = 0; j < out_vec.size(); ++j) { + VLOG(3) << "out_vec[" << j << "]:" << out_vec[j]; + } + + ctx.Wait(); + + // out0 + std::vector out1_vec; + f::TensorToVector(*out1, ctx, &out1_vec); + EXPECT_EQ(out1_vec.size(), static_cast(4)); + for (size_t j = 0; j < out1_vec.size(); ++j) { + VLOG(3) << "out1_vec[" << j << "]:" << out1_vec[j]; + } + + ctx.Wait(); + + // out found_inf + Tensor found_inf_tensor; + found_inf_tensor.Resize({1}); + bool *found_inf_data = + found_inf_tensor.mutable_data(paddle::platform::CPUPlace()); + f::TensorCopy(*found_inf, place, &found_inf_tensor); + EXPECT_TRUE(*found_inf_data); + + ctx.Wait(); +} + +TEST(check_finite_and_unscale, NPU_fp32) { + f::Scope scope; + auto *ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + Compare(&scope, *ctx); +} + +TEST(check_finite_and_unscale, NPU_fp16) { + f::Scope scope; + auto *ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + Compare(&scope, *ctx); +} diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op.cu b/paddle/fluid/operators/amp/update_loss_scaling_op.cu index b48b0e78892933bc76894611d0ae6d01c194d036..de1f83c1ee50d00960c50638fab5fd6cffca1a36 100644 --- a/paddle/fluid/operators/amp/update_loss_scaling_op.cu +++ b/paddle/fluid/operators/amp/update_loss_scaling_op.cu @@ -34,13 +34,39 @@ __global__ void GpuUpdateLossScaling( } template -__global__ void FillIf(T* data, const int64_t num, const T value, - const bool* has_inf) { - if 
(*has_inf) { - int tid = threadIdx.x + blockIdx.x * blockDim.x; - for (int i = tid; i < num; i += blockDim.x * gridDim.x) { - data[i] = value; - } +__global__ void FusedFillIf(T** outs, const size_t xs_size, + const int64_t* starts, const T value, + const bool* has_inf) { + if (!(*has_inf)) return; + + const int tid = threadIdx.x + blockIdx.x * blockDim.x; + + // copy starts array from global memory to shared memory + extern __shared__ int64_t s_starts[]; + for (int i = threadIdx.x; i <= xs_size; i += blockDim.x) { + s_starts[i] = starts[i]; + } + __syncthreads(); + + const int64_t total_num = s_starts[xs_size]; + int out_index = 0; + + for (int64_t id = tid; id < total_num; id += blockDim.x * gridDim.x) { + // get the "out" index of "id" + // For example: + // id = 15, starts = [0, 10, 10, 20, 30] + // because 10 <= id < 20 ==> + // the id element locate in the 3rd tensor (notice the 2nd tensor size is 0) + int next_out_index = out_index; + while (id >= s_starts[next_out_index]) next_out_index++; + out_index = next_out_index - 1; + + // get data pointer and index + T* out_data = outs[out_index]; + int64_t idx = id - s_starts[out_index]; + + // set value + out_data[idx] = value; } } @@ -68,15 +94,52 @@ class LazyZeros { const bool* found_inf_data, const std::vector& xs, const std::vector& outs) const { - for (size_t i = 0; i < xs.size(); ++i) { - auto* out = outs[i]; - T* out_data = out->mutable_data(dev_ctx.GetPlace()); - int64_t num = out->numel(); - int block = 1024; - int grid = (block - 1 + num) / block; - FillIf<<>>( - out_data, num, static_cast(0), found_inf_data); + size_t xs_size = xs.size(); + const auto& cpu_place = platform::CPUPlace(); + // alloc each tensor's start index and copy to device + auto h_in_starts_mem = + memory::Alloc(cpu_place, (xs_size + 1) * sizeof(int64_t)); + int64_t* h_starts = reinterpret_cast(h_in_starts_mem->ptr()); + + auto d_in_starts_mem = + memory::Alloc(dev_ctx, (xs_size + 1) * sizeof(int64_t)); + int64_t* d_starts = reinterpret_cast(d_in_starts_mem->ptr()); + + // the start index value of each tensor is + // the sum of previous tensor's size. 
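As an aside (illustration only, not part of the patch), the flattened-offset scheme used here is the same one used by the fused CheckFiniteAndUnscale kernel earlier in this change: a prefix-sum "starts" array maps a global element id back to the tensor that owns it. A standalone sketch, with the helper names BuildStarts and FindTensorIndex invented for this note; the original comment's worked example continues right after this sketch.

#include <cstddef>
#include <cstdint>
#include <vector>

// starts[i] is the global offset of tensor i; starts.back() is the total
// element count. E.g. numels {10, 0, 10, 10} -> starts {0, 10, 10, 20, 30}.
std::vector<int64_t> BuildStarts(const std::vector<int64_t>& numels) {
  std::vector<int64_t> starts(numels.size() + 1, 0);
  for (std::size_t i = 0; i < numels.size(); ++i) {
    starts[i + 1] = starts[i] + numels[i];
  }
  return starts;
}

// Returns idx such that starts[idx] <= id < starts[idx + 1], using the same
// forward linear scan as the device kernels (valid for 0 <= id < total).
int FindTensorIndex(const std::vector<int64_t>& starts, int64_t id) {
  int next = 0;
  while (id >= starts[next]) ++next;
  return next - 1;
}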
For example: + // outs = [10, 0, 10, 10] ==> starts = [0, 10, 10, 20, 30] + h_starts[0] = 0; + for (int i = 0; i < xs_size; i++) { + h_starts[i + 1] = h_starts[i] + outs[i]->numel(); } + memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), + d_starts, cpu_place, h_starts, (xs_size + 1) * sizeof(int64_t), + dev_ctx.stream()); + + // copy each tensor of "outs" data address array to device + auto h_out_addrs_mem = memory::Alloc(cpu_place, xs_size * sizeof(T*)); + T** h_out_addrs = reinterpret_cast(h_out_addrs_mem->ptr()); + + auto d_out_addrs_mem = memory::Alloc(dev_ctx, xs_size * sizeof(T*)); + T** d_out_addrs = reinterpret_cast(d_out_addrs_mem->ptr()); + + for (size_t i = 0; i < xs_size; ++i) { + h_out_addrs[i] = outs[i]->mutable_data(dev_ctx.GetPlace()); + } + memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), + d_out_addrs, cpu_place, h_out_addrs, xs_size * sizeof(T*), + dev_ctx.stream()); + + // launch cuda kernel + int64_t total_num = h_starts[xs_size]; + int64_t threads_per_block = std::min(static_cast(1024), total_num); + int64_t elements_per_block = + threads_per_block * 50; // each thread deal with 50 data + int64_t blocks_per_grid = + (total_num + elements_per_block - 1) / elements_per_block; + FusedFillIf<<>>( + d_out_addrs, xs_size, d_starts, static_cast(0), found_inf_data); } }; diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc b/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..820966addfcff31d1676aedd71101a2e3c5a4332 --- /dev/null +++ b/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc @@ -0,0 +1,247 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/amp/update_loss_scaling_op.h" +#include +#include +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +void Update(const platform::NPUDeviceContext& ctx, + const std::vector found_inf_vec, + const Tensor* pre_loss_scaling_tensor, const Tensor* good_in_tensor, + const Tensor* bad_in_tensor, const int incr_every_n_steps, + const int decr_every_n_nan_or_inf, const float incr_ratio, + const float decr_ratio, Tensor* updated_loss_scaling_tensor, + Tensor* good_out_tensor, Tensor* bad_out_tensor) { + auto place = ctx.GetPlace(); + auto stream = ctx.stream(); + if (found_inf_vec[0]) { + // good_out_data = 0 + auto g = good_out_tensor->mutable_data(place); + platform::NPUMemsetAsync(static_cast(g), 0, + good_out_tensor->numel() * sizeof(int), stream); + // bad_out_data = bad_in_data + 1 + Tensor factor_tensor(bad_out_tensor->type()); + factor_tensor.mutable_data({1}, place); + FillNpuTensorWithConstant(&factor_tensor, static_cast(1)); + auto runner_p2 = NpuOpRunner("Add", {*bad_in_tensor, factor_tensor}, + {*bad_out_tensor}, {}); + runner_p2.Run(stream); + + std::vector bad_out_data; + TensorToVector(*bad_out_tensor, ctx, &bad_out_data); + if (bad_out_data[0] == decr_every_n_nan_or_inf) { + auto runner_p3 = NpuOpRunner("Power", {*pre_loss_scaling_tensor}, + {*updated_loss_scaling_tensor}, + {{"power", static_cast(1)}, + {"scale", decr_ratio}, + {"shift", static_cast(0)}}); + + runner_p3.Run(stream); + + std::vector new_loss_scaling; + TensorToVector(*updated_loss_scaling_tensor, ctx, &new_loss_scaling); + if (new_loss_scaling[0] < static_cast(1)) { + // updated_loss_scaling_data = 1 + auto runner_p4 = NpuOpRunner("Power", {*pre_loss_scaling_tensor}, + {*updated_loss_scaling_tensor}, + {{"power", static_cast(1)}, + {"scale", static_cast(0)}, + {"shift", static_cast(1)}}); + + runner_p4.Run(stream); + } + + // bad_out_data = 0 + auto b = bad_out_tensor->mutable_data(place); + platform::NPUMemsetAsync(static_cast(b), 0, + bad_out_tensor->numel() * sizeof(int), stream); + } + } else { + // bad_out_data = 0 + auto b = bad_out_tensor->mutable_data(place); + platform::NPUMemsetAsync(static_cast(b), 0, + bad_out_tensor->numel() * sizeof(int), stream); + + // good_out_data = good_in_data + 1 + Tensor factor_tensor(good_out_tensor->type()); + factor_tensor.mutable_data({1}, place); + FillNpuTensorWithConstant(&factor_tensor, static_cast(1)); + auto runner_p2 = NpuOpRunner("Add", {*good_in_tensor, factor_tensor}, + {*good_out_tensor}, {}); + runner_p2.Run(stream); + + std::vector good_out_data; + TensorToVector(*good_out_tensor, ctx, &good_out_data); + + if (good_out_data[0] == incr_every_n_steps) { + auto runner_p3 = NpuOpRunner("Power", {*pre_loss_scaling_tensor}, + {*updated_loss_scaling_tensor}, + {{"power", static_cast(1)}, + {"scale", incr_ratio}, + {"shift", static_cast(0)}}); + runner_p3.Run(stream); + + std::vector new_loss_scaling; + TensorToVector(*updated_loss_scaling_tensor, ctx, &new_loss_scaling); + if (!std::isfinite(new_loss_scaling[0])) { + // updated_loss_scaling_data = pre_loss_scaling_data + auto runner_p4 = NpuOpRunner("Power", {*pre_loss_scaling_tensor}, + {*updated_loss_scaling_tensor}, + {{"power", static_cast(1)}, + {"scale", static_cast(1)}, + {"shift", static_cast(0)}}); + + runner_p4.Run(stream); + } + // good_out_data = 0 + auto g = 
good_out_tensor->mutable_data(place); + platform::NPUMemsetAsync(static_cast(g), 0, + good_out_tensor->numel() * sizeof(int), stream); + } + } +} + +template +class UpdateLossScalingFunctor { + public: + void operator()(const platform::NPUDeviceContext& dev_ctx, + const std::vector found_inf_vec, + const Tensor* pre_loss_scaling_tensor, + const Tensor* good_in_tensor, const Tensor* bad_in_tensor, + const int incr_every_n_steps, + const int decr_every_n_nan_or_inf, const float incr_ratio, + const float decr_ratio, Tensor* updated_loss_scaling_tensor, + Tensor* good_out_tensor, Tensor* bad_out_tensor) const { + Update(dev_ctx, found_inf_vec, pre_loss_scaling_tensor, good_in_tensor, + bad_in_tensor, incr_every_n_steps, decr_every_n_nan_or_inf, + incr_ratio, decr_ratio, updated_loss_scaling_tensor, + good_out_tensor, bad_out_tensor); + } +}; + +template +class LazyZerosNPU { + public: + void operator()(const platform::NPUDeviceContext& dev_ctx, + const std::vector found_inf_vec, + const std::vector& xs, + const std::vector& outs) const { + if (!xs.size()) { + return; + } + auto place = dev_ctx.GetPlace(); + auto stream = dev_ctx.stream(); + Tensor* zero_tensor; + void* zero_ptr; + if (found_inf_vec[0]) { + int max_num = -1; + for (size_t i = 0; i < xs.size(); ++i) { + auto* out = outs[i]; + int num = out->numel(); + if (max_num < num) { + max_num = num; + zero_tensor = out; + } + } + + zero_tensor->mutable_data(place); + auto runner_zeros = + NpuOpRunner("ZerosLike", {*zero_tensor}, {*zero_tensor}); + runner_zeros.Run(stream); + zero_tensor->check_memory_size(); + zero_ptr = zero_tensor->data(); + } + + for (size_t i = 0; i < xs.size(); ++i) { + auto* out = outs[i]; + auto* x = xs[i]; + auto dst_ptr = out->mutable_data(place); + if (!found_inf_vec[0]) { + framework::TensorCopy(*x, place, dev_ctx, out); + } else if (zero_ptr != dst_ptr) { + auto size = out->numel() * framework::SizeOfType(out->type()); + memory::Copy(BOOST_GET_CONST(platform::NPUPlace, place), dst_ptr, + BOOST_GET_CONST(platform::NPUPlace, place), zero_ptr, size, + stream); + } + } + } +}; + +template +class UpdateLossScalingNPUKernel : public framework::OpKernel { + using MPDType = typename details::MPTypeTrait::Type; + + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& dev_ctx = ctx.template device_context(); + + const auto xs = ctx.MultiInput("X"); + auto outs = ctx.MultiOutput("Out"); + const auto* found_inf = ctx.Input("FoundInfinite"); + PADDLE_ENFORCE_EQ(found_inf->numel(), 1, + platform::errors::InvalidArgument( + "FoundInfinite must has only one element.")); + + std::vector found_inf_vec; + TensorToVector(*found_inf, ctx.device_context(), &found_inf_vec); + + LazyZerosNPU{}(dev_ctx, found_inf_vec, xs, outs); + const bool stop_update = ctx.Attr("stop_update"); + if (stop_update) { + return; + } + + const auto* pre_loss_scaling = ctx.Input("PrevLossScaling"); + const auto* good_in = ctx.Input("InGoodSteps"); + const auto* bad_in = ctx.Input("InBadSteps"); + auto* updated_loss_scaling = ctx.Output("LossScaling"); + auto* good_out = ctx.Output("OutGoodSteps"); + auto* bad_out = ctx.Output("OutBadSteps"); + + updated_loss_scaling->mutable_data(dev_ctx.GetPlace()); + good_out->mutable_data(dev_ctx.GetPlace()); + bad_out->mutable_data(dev_ctx.GetPlace()); + + const int incr_every_n_steps = ctx.Attr("incr_every_n_steps"); + const int decr_every_n_nan_or_inf = + ctx.Attr("decr_every_n_nan_or_inf"); + const float incr_ratio = ctx.Attr("incr_ratio"); + const float decr_ratio = 
ctx.Attr("decr_ratio"); + UpdateLossScalingFunctor{}( + dev_ctx, found_inf_vec, pre_loss_scaling, good_in, bad_in, + incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio, decr_ratio, + updated_loss_scaling, good_out, bad_out); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL( + update_loss_scaling, + ops::UpdateLossScalingNPUKernel, + ops::UpdateLossScalingNPUKernel); diff --git a/paddle/fluid/operators/assign_op.cc b/paddle/fluid/operators/assign_op.cc index add533bafcb0a7f20c76f0844fb609d7af719bb1..433cabcfee0104a1112baa4aca6c18d072d8f696 100644 --- a/paddle/fluid/operators/assign_op.cc +++ b/paddle/fluid/operators/assign_op.cc @@ -162,6 +162,7 @@ REGISTER_OP_CPU_KERNEL_FUNCTOR(assign, float, ops::AssignKernel, double, ops::AssignKernel, int, ops::AssignKernel, int64_t, ops::AssignKernel, bool, ops::AssignKernel, plat::float16, + ops::AssignKernel, plat::bfloat16, ops::AssignKernel); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) diff --git a/paddle/fluid/operators/assign_op_npu.cc b/paddle/fluid/operators/assign_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..93689d5e495f33484d2f05b04d25734a8c5ab07e --- /dev/null +++ b/paddle/fluid/operators/assign_op_npu.cc @@ -0,0 +1,63 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#include "paddle/fluid/operators/assign_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" +#include "paddle/fluid/platform/float16.h" + +namespace paddle { +namespace framework { +class OpDesc; +class Variable; +} // namespace framework +namespace imperative { +class OpBase; +} // namespace imperative +namespace platform { +struct CPUPlace; +struct CUDAPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace paddle { +namespace operators { +template +class AssignNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + auto runner = NpuOpRunner("Assign", {*out, *x}, {*out}, {}); + auto stream = + ctx.template device_context() + .stream(); + runner.Run(stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL( + assign, ops::AssignNPUKernel, + ops::AssignNPUKernel, + ops::AssignNPUKernel) diff --git a/paddle/fluid/operators/assign_op_npu_test.cc b/paddle/fluid/operators/assign_op_npu_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..792d01a5efe43034c201a57641cf3dc1b4c38e4c --- /dev/null +++ b/paddle/fluid/operators/assign_op_npu_test.cc @@ -0,0 +1,80 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
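// A plain C++ reference (illustration only; the names below are not from the patch)
// of the dynamic loss-scaling policy that the Update() routine in
// update_loss_scaling_op_npu.cc above expresses with Add/Power/Memset NPU ops:
// grow the scale after incr_every_n_steps consecutive finite steps, shrink it after
// decr_every_n_nan_or_inf consecutive inf/nan steps, clamp it at 1, and never let it
// overflow.
#include <cmath>

struct LossScaleState {
  float scale;
  int good_steps;
  int bad_steps;
};

void UpdateLossScaleRef(LossScaleState* s, bool found_inf, int incr_every_n_steps,
                        int decr_every_n_nan_or_inf, float incr_ratio,
                        float decr_ratio) {
  if (found_inf) {
    s->good_steps = 0;
    if (++s->bad_steps == decr_every_n_nan_or_inf) {
      float next = s->scale * decr_ratio;
      s->scale = next < 1.0f ? 1.0f : next;  // clamp at 1, as runner_p4 does above
      s->bad_steps = 0;
    }
  } else {
    s->bad_steps = 0;
    if (++s->good_steps == incr_every_n_steps) {
      float next = s->scale * incr_ratio;
      if (std::isfinite(next)) s->scale = next;  // keep the old scale on overflow
      s->good_steps = 0;
    }
  }
}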
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifndef _WIN32 +#include +#endif + +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(assign); +USE_OP_DEVICE_KERNEL(assign, NPU); + +template +void Compare(f::Scope* scope, const p::DeviceContext& ctx, + std::string op_type) { + // init + auto x = scope->Var("X"); + auto tensor_x = x->GetMutable(); + + std::vector init; + init.push_back(static_cast(1.0)); + init.push_back(static_cast(2.0)); + init.push_back(static_cast(3.0)); + init.push_back(static_cast(4.0)); + + TensorFromVector(init, ctx, tensor_x); + tensor_x->Resize({4}); + + ctx.Wait(); + + auto place = ctx.GetPlace(); + auto out = scope->Var("Out"); + auto tensor_out = out->GetMutable(); + + auto op = + f::OpRegistry::CreateOp(op_type, {{"X", {"X"}}}, {{"Out", {"Out"}}}, {}); + + op->Run(*scope, place); + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + + ctx.Wait(); + + EXPECT_EQ((uint32_t)out_vec.size(), (uint32_t)4); + EXPECT_EQ(out_vec[0], static_cast(1.0)); + EXPECT_EQ(out_vec[1], static_cast(2.0)); + EXPECT_EQ(out_vec[2], static_cast(3.0)); + EXPECT_EQ(out_vec[3], static_cast(4.0)); +} + +TEST(assign, NPU_fp32) { + f::Scope scope; + auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + Compare(&scope, *ctx, "assign"); +} diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index fc31885824b55f22bba77559d728a1e40d47e784..edad20435b41c9eb59c3df793c00ab3bfe96771b 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -575,7 +575,7 @@ class BatchNormGradKernel // SavedVariance have been reverted in forward operator const auto *saved_inv_variance = ctx.Input("SavedVariance"); const std::string data_layout_str = ctx.Attr("data_layout"); - const bool use_global_stats = ctx.Attr("use_global_stats"); + bool use_global_stats = ctx.Attr("use_global_stats"); const bool is_test = ctx.Attr("is_test"); const float epsilon = ctx.Attr("epsilon"); const DataLayout data_layout = @@ -585,6 +585,8 @@ class BatchNormGradKernel auto *d_scale = ctx.Output(framework::GradVarName("Scale")); auto *d_bias = ctx.Output(framework::GradVarName("Bias")); + use_global_stats = is_test || use_global_stats; + // batch_norm with inplace as false will take X as grad input, which // is same as cuDNN batch_norm backward calculation, batch_norm // with inplace as true only take Y as input and X should be calculate @@ -605,13 +607,6 @@ class BatchNormGradKernel "X@GRAD and Y@GRAD inplaced in non-inplace mode")); } - PADDLE_ENFORCE_EQ( - is_test, false, - platform::errors::InvalidArgument( - "`is_test = True` CANNOT be used in train program. 
If " - "you want to use global status in pre_train model, " - "please set `use_global_stats = True`")); - // Get the size for each dimension. // NCHW [batch_size, in_channels, in_height, in_width] const auto &x_dims = x->dims(); diff --git a/paddle/fluid/operators/batch_norm_op.cu b/paddle/fluid/operators/batch_norm_op.cu index 444c24b826b1b89175136e650aab15ad7b8b2881..6fc78732b1063af04a34de5d690a4f2ed75978f2 100644 --- a/paddle/fluid/operators/batch_norm_op.cu +++ b/paddle/fluid/operators/batch_norm_op.cu @@ -41,6 +41,83 @@ using CudnnDataType = platform::CudnnDataType; template using BatchNormParamType = typename CudnnDataType::BatchNormParamType; +template +static __global__ void BNForwardInference( + const T *x, const BatchNormParamType *mean, + const BatchNormParamType *variance, const BatchNormParamType *scale, + const BatchNormParamType *bias, const int C, const int N, const int HxW, + const double epsilon, T *y) { + int gid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + int num = N * C * HxW; + for (int i = gid; i < num; i += stride) { + const int c = layout == framework::DataLayout::kNCHW ? i / HxW % C : i % C; + BatchNormParamType x_sub_mean = + static_cast>(x[i]) - mean[c]; + BatchNormParamType inv_var = 1 / sqrt(variance[c] + epsilon); + y[i] = static_cast(scale[c] * x_sub_mean * inv_var + bias[c]); + } +} + +template +static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTraining( + const T *x, const BatchNormParamType *scale, + const BatchNormParamType *bias, const int C, const int N, const int HxW, + const double epsilon, double exponentialAverageFactor, T *y, + BatchNormParamType *mean, BatchNormParamType *variance, + BatchNormParamType *save_mean, + BatchNormParamType *save_inv_variance) { + int outer_size = C; + int inner_size = N * HxW; + typedef cub::BlockReduce, BlockDim> BlockReduce; + __shared__ typename BlockReduce::TempStorage mean_storage; + __shared__ typename BlockReduce::TempStorage variance_storeage; + __shared__ BatchNormParamType mean_val; + __shared__ BatchNormParamType variance_val; + __shared__ BatchNormParamType inv_var_val; + + for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { + BatchNormParamType x_sum = static_cast>(0); + BatchNormParamType x_square_sum = static_cast>(0); + + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = layout == framework::DataLayout::kNCHW + ? (j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + BatchNormParamType x_i = static_cast>(x[index]); + x_sum += x_i; + x_square_sum += x_i * x_i; + } + x_sum = BlockReduce(mean_storage).Reduce(x_sum, cub::Sum()); + x_square_sum = + BlockReduce(variance_storeage).Reduce(x_square_sum, cub::Sum()); + if (threadIdx.x == 0) { + mean_val = x_sum / inner_size; + variance_val = x_square_sum / inner_size - mean_val * mean_val; + inv_var_val = 1 / sqrt(variance_val + epsilon); + + if (save_mean && save_inv_variance) { + save_mean[i] = mean_val; + save_inv_variance[i] = inv_var_val; + } + mean[i] = (1 - exponentialAverageFactor) * mean_val + + exponentialAverageFactor * mean[i]; + variance[i] = (1 - exponentialAverageFactor) * variance_val + + exponentialAverageFactor * variance[i]; + } + __syncthreads(); + + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = layout == framework::DataLayout::kNCHW + ? 
(j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + BatchNormParamType x_sub_mean = + static_cast>(x[index]) - mean_val; + y[index] = scale[i] * x_sub_mean * inv_var_val + bias[i]; + } + } +} + template class BatchNormKernel : public framework::OpKernel { @@ -80,8 +157,12 @@ class BatchNormKernel auto dtype = platform::CudnnDataType::type; #ifdef PADDLE_WITH_HIP - // HIP do not support compute format of NHWC - auto compute_format = DataLayout::kNCHW; + auto compute_format = data_layout == DataLayout::kNHWC ? DataLayout::kNHWC + : DataLayout::kNCHW; + +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// HIP do not support compute format of NHWC +// auto compute_format = DataLayout::kNCHW; #else const bool fast_nhwc_batch_norm = test_mode || @@ -111,14 +192,15 @@ class BatchNormKernel // ------------------- cudnn descriptors --------------------- #ifdef PADDLE_WITH_HIP - miopenTensorDescriptor_t data_desc_; - miopenTensorDescriptor_t bn_param_desc_; - miopenBatchNormMode_t mode_; - - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_)); +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// miopenTensorDescriptor_t data_desc_; +// miopenTensorDescriptor_t bn_param_desc_; +// miopenBatchNormMode_t mode_; + +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_)); #else cudnnTensorDescriptor_t data_desc_; cudnnTensorDescriptor_t bn_param_desc_; @@ -138,7 +220,8 @@ class BatchNormKernel epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); #ifdef PADDLE_WITH_HIP - mode_ = miopenBNSpatial; +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// mode_ = miopenBNSpatial; #elif CUDNN_VERSION_MIN(7, 0, 1) if (FLAGS_cudnn_batchnorm_spatial_persistent) { mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; @@ -161,14 +244,15 @@ class BatchNormKernel } #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetTensorDescriptor( - data_desc_, CudnnDataType::type, - x_dims.size() > 3 ? x_dims.size() : 4, const_cast(dims.data()), - const_cast(strides.data()))); - // Note: PERSISTENT not implemented for inference - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenDeriveBNTensorDescriptor( - bn_param_desc_, data_desc_, test_mode ? miopenBNSpatial : mode_)); +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetTensorDescriptor( +// data_desc_, CudnnDataType::type, +// x_dims.size() > 3 ? x_dims.size() : 4, const_cast(dims.data()), +// const_cast(strides.data()))); +// Note: PERSISTENT not implemented for inference +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenDeriveBNTensorDescriptor( +// bn_param_desc_, data_desc_, test_mode ? 
miopenBNSpatial : mode_)); #else PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( data_desc_, CudnnDataType::type, @@ -226,28 +310,53 @@ class BatchNormKernel C, est_var->dims()[0], est_var->dims())); #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenBatchNormalizationForwardInference( - handle, miopenBNSpatial, - const_cast( - static_cast(CudnnDataType::kOne())), - const_cast( - static_cast(CudnnDataType::kZero())), - data_desc_, - static_cast(transformed_x.template data()), - data_desc_, - static_cast( - transformed_y.template mutable_data(ctx.GetPlace())), - bn_param_desc_, - const_cast(static_cast( - scale->template data>())), - const_cast(static_cast( - bias->template data>())), - const_cast(static_cast( - est_mean->template data>())), - const_cast(static_cast( - est_var->template data>())), - epsilon)); + const int block_size = 256; + const int grid_size = (N * C * H * W * D + block_size - 1) / block_size; + if (compute_format == DataLayout::kNCHW) { + BNForwardInference< + T, + DataLayout::kNCHW><<>>( + transformed_x.template data(), + est_mean->template data>(), + est_var->template data>(), + scale->template data>(), + bias->template data>(), C, N, H * W * D, + epsilon, transformed_y.template data()); + } else { + BNForwardInference< + T, + DataLayout::kNHWC><<>>( + transformed_x.template data(), + est_mean->template data>(), + est_var->template data>(), + scale->template data>(), + bias->template data>(), C, N, H * W * D, + epsilon, transformed_y.template data()); + } + +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenBatchNormalizationForwardInference( +// handle, miopenBNSpatial, +// const_cast( +// static_cast(CudnnDataType::kOne())), +// const_cast( +// static_cast(CudnnDataType::kZero())), +// data_desc_, +// static_cast(transformed_x.template data()), +// data_desc_, +// static_cast( +// transformed_y.template mutable_data(ctx.GetPlace())), +// bn_param_desc_, +// const_cast(static_cast( +// scale->template data>())), +// const_cast(static_cast( +// bias->template data>())), +// const_cast(static_cast( +// est_mean->template data>())), +// const_cast(static_cast( +// est_var->template data>())), +// epsilon)); #else PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::cudnnBatchNormalizationForwardInference( @@ -365,34 +474,66 @@ class BatchNormKernel #endif // CUDNN_VERSION_MIN(7, 4, 1) if (!called) { #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenBatchNormalizationForwardTraining( - handle, mode_, const_cast(static_cast( - CudnnDataType::kOne())), - const_cast( - static_cast(CudnnDataType::kZero())), - data_desc_, - static_cast(transformed_x.template data()), - data_desc_, - static_cast( - transformed_y.template mutable_data(ctx.GetPlace())), - bn_param_desc_, - const_cast(static_cast( - scale->template data>())), - const_cast(static_cast( - bias->template data>())), - this_factor, - static_cast( - mean_out->template mutable_data>( - ctx.GetPlace())), - static_cast(variance_out->template mutable_data< - BatchNormParamType>(ctx.GetPlace())), - epsilon, - static_cast( - saved_mean->template mutable_data>( - ctx.GetPlace())), - static_cast(saved_variance->template mutable_data< - BatchNormParamType>(ctx.GetPlace())))); + const int num = transformed_x.numel(); + const int block = 256; + const int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); + const int max_blocks = std::max(max_threads / block, 1); + 
const int grid = std::min(C, max_blocks); + if (compute_format == DataLayout::kNCHW) { + BNForwardTraining< + T, block, + DataLayout::kNCHW><<>>( + transformed_x.template data(), + scale->template data>(), + bias->template data>(), C, N, H * W * D, + epsilon, this_factor, transformed_y.template data(), + mean_out->template data>(), + variance_out->template data>(), + saved_mean->template data>(), + saved_variance->template data>()); + } else { + BNForwardTraining< + T, block, + DataLayout::kNHWC><<>>( + transformed_x.template data(), + scale->template data>(), + bias->template data>(), C, N, H * W * D, + epsilon, this_factor, transformed_y.template data(), + mean_out->template data>(), + variance_out->template data>(), + saved_mean->template data>(), + saved_variance->template data>()); + } + +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenBatchNormalizationForwardTraining( +// handle, mode_, const_cast(static_cast( +// CudnnDataType::kOne())), +// const_cast( +// static_cast(CudnnDataType::kZero())), +// data_desc_, +// static_cast(transformed_x.template data()), +// data_desc_, +// static_cast( +// transformed_y.template mutable_data(ctx.GetPlace())), +// bn_param_desc_, +// const_cast(static_cast( +// scale->template data>())), +// const_cast(static_cast( +// bias->template data>())), +// this_factor, +// static_cast( +// mean_out->template mutable_data>( +// ctx.GetPlace())), +// static_cast(variance_out->template mutable_data< +// BatchNormParamType>(ctx.GetPlace())), +// epsilon, +// static_cast( +// saved_mean->template mutable_data>( +// ctx.GetPlace())), +// static_cast(saved_variance->template mutable_data< +// BatchNormParamType>(ctx.GetPlace())))); #else PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::cudnnBatchNormalizationForwardTraining( @@ -423,11 +564,12 @@ class BatchNormKernel ctx, &transformed_y, y); } #ifdef PADDLE_WITH_HIP - // clean when exit. - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_)); +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// clean when exit. +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_)); #else // clean when exit. 
PADDLE_ENFORCE_CUDA_SUCCESS( @@ -439,7 +581,7 @@ class BatchNormKernel }; template -static __global__ void KeBNBackwardScaleBias( +static __global__ LAUNCH_BOUNDS(BlockDim) void KeBNBackwardScaleBias( const T *dy, const T *x, const BatchNormParamType *mean, const BatchNormParamType *variance, const double epsilon, const int N, const int C, const int HxW, BatchNormParamType *dscale, @@ -526,13 +668,97 @@ class InplaceHelper { }; template -static __global__ void BNBackwardData(const T *dy, - const BatchNormParamType *scale, - const BatchNormParamType *mean, - const T *x, - const BatchNormParamType *variance, - const int C, const int N, const int HxW, - T *dx) { +static __global__ LAUNCH_BOUNDS(BlockDim) void BNBackward( + const T *dy, const T *x, const BatchNormParamType *scale, + const BatchNormParamType *saved_mean, + const BatchNormParamType *saved_inv_variance, const int C, const int N, + const int HxW, const double epsilon, T *dx, BatchNormParamType *dscale, + BatchNormParamType *dbias) { + const int outer_size = C; + const int inner_size = N * HxW; + typedef cub::BlockReduce, BlockDim> BlockReduce; + __shared__ typename BlockReduce::TempStorage ds_storage; + __shared__ typename BlockReduce::TempStorage db_storage; + __shared__ typename BlockReduce::TempStorage mean_storage; + __shared__ typename BlockReduce::TempStorage variance_storeage; + __shared__ BatchNormParamType inv_var_val; + __shared__ BatchNormParamType mean_val; + __shared__ BatchNormParamType dscale_val; + __shared__ BatchNormParamType dbias_val; + + for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { + BatchNormParamType ds_sum = static_cast>(0); + BatchNormParamType db_sum = static_cast>(0); + + if (saved_mean && saved_inv_variance) { + if (threadIdx.x == 0) { + inv_var_val = saved_inv_variance[i]; + mean_val = saved_mean[i]; + } + } else { + BatchNormParamType x_sum = static_cast>(0); + BatchNormParamType x_square_sum = + static_cast>(0); + + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = layout == framework::DataLayout::kNCHW + ? (j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + BatchNormParamType x_i = + static_cast>(x[index]); + x_sum += x_i; + x_square_sum += x_i * x_i; + } + x_sum = BlockReduce(mean_storage).Reduce(x_sum, cub::Sum()); + x_square_sum = + BlockReduce(variance_storeage).Reduce(x_square_sum, cub::Sum()); + if (threadIdx.x == 0) { + mean_val = x_sum / inner_size; + inv_var_val = + 1 / sqrt(x_square_sum / inner_size - mean_val * mean_val + epsilon); + } + } + __syncthreads(); + + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = layout == framework::DataLayout::kNCHW + ? (j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + BatchNormParamType dy_i = + static_cast>(dy[index]); + ds_sum += + dy_i * (static_cast>(x[index]) - mean_val); + db_sum += dy_i; + } + ds_sum = BlockReduce(ds_storage).Reduce(ds_sum, cub::Sum()); + db_sum = BlockReduce(db_storage).Reduce(db_sum, cub::Sum()); + if (threadIdx.x == 0) { + dscale_val = ds_sum * inv_var_val; + dbias_val = db_sum; + dscale[i] = dscale_val; + dbias[i] = dbias_val; + } + __syncthreads(); + + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = layout == framework::DataLayout::kNCHW + ? 
(j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + dx[index] = scale[i] * inv_var_val * + (static_cast>(dy[index]) - + dbias_val / static_cast>(inner_size) - + (static_cast>(x[index]) - mean_val) * + inv_var_val * dscale_val / inner_size); + } + } +} + +template +static __global__ LAUNCH_BOUNDS(BlockDim) void BNBackwardData( + const T *dy, const BatchNormParamType *scale, + const BatchNormParamType *mean, const T *x, + const BatchNormParamType *variance, const int C, const int N, + const int HxW, T *dx) { const int outer_size = C; const int inner_size = N * HxW; typedef cub::BlockReduce, BlockDim> BlockReduce; @@ -567,7 +793,6 @@ static __global__ void BNBackwardData(const T *dy, dy_x_sub_mean_sum_val = dy_x_sub_mean_sum; } __syncthreads(); - for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { const int index = layout == framework::DataLayout::kNCHW ? (j / HxW * C + i) * HxW + j % HxW @@ -592,7 +817,7 @@ class BatchNormGradKernel platform::errors::InvalidArgument("It must use CUDAPlace.")); double epsilon = static_cast(ctx.Attr("epsilon")); const std::string data_layout_str = ctx.Attr("data_layout"); - const bool use_global_stats = ctx.Attr("use_global_stats"); + bool use_global_stats = ctx.Attr("use_global_stats"); const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); @@ -625,12 +850,7 @@ class BatchNormGradKernel } const bool is_test = ctx.Attr("is_test"); - PADDLE_ENFORCE_EQ( - is_test, false, - platform::errors::InvalidArgument( - "`is_test = True` CANNOT be used in train program. If " - "you want to use global status in pre_train model, " - "please set `use_global_stats = True`")); + use_global_stats = is_test || use_global_stats; const auto &x_dims = x->dims(); @@ -668,8 +888,12 @@ class BatchNormGradKernel auto dtype = platform::CudnnDataType::type; const auto *reserve_space = ctx.Input("ReserveSpace"); #ifdef PADDLE_WITH_HIP - // HIP do not support compute format of NHWC - auto compute_format = DataLayout::kNCHW; + auto compute_format = data_layout == DataLayout::kNHWC ? 
DataLayout::kNHWC + : DataLayout::kNCHW; + +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// HIP do not support compute format of NHWC +// auto compute_format = DataLayout::kNCHW; #else const bool fast_nhwc_batch_norm = dtype == CUDNN_DATA_HALF && FLAGS_cudnn_batchnorm_spatial_persistent && @@ -714,7 +938,11 @@ class BatchNormGradKernel auto &dev_ctx = ctx.template device_context(); const int num = transformed_x.numel(); +#ifdef HIPCC + const int block = 256; +#else const int block = 512; +#endif int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); const int max_blocks = std::max(max_threads / block, 1); int grid1 = (num + block - 1) / block; @@ -734,14 +962,15 @@ class BatchNormGradKernel // ------------------- cudnn descriptors --------------------- #ifdef PADDLE_WITH_HIP - miopenTensorDescriptor_t data_desc_; - miopenTensorDescriptor_t bn_param_desc_; - miopenBatchNormMode_t mode_; - - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_)); +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// miopenTensorDescriptor_t data_desc_; +// miopenTensorDescriptor_t bn_param_desc_; +// miopenBatchNormMode_t mode_; + +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_)); #else cudnnTensorDescriptor_t data_desc_; cudnnTensorDescriptor_t bn_param_desc_; @@ -759,7 +988,8 @@ class BatchNormGradKernel } epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); #ifdef PADDLE_WITH_HIP - mode_ = miopenBNSpatial; +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// mode_ = miopenBNSpatial; #elif CUDNN_VERSION_MIN(7, 0, 1) if (FLAGS_cudnn_batchnorm_spatial_persistent) { mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; @@ -771,13 +1001,14 @@ class BatchNormGradKernel #endif // CUDNN_VERSION_MIN(7, 0, 1) #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetTensorDescriptor( - data_desc_, CudnnDataType::type, - x_dims.size() > 3 ? x_dims.size() : 4, const_cast(dims.data()), - const_cast(strides.data()))); - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenDeriveBNTensorDescriptor(bn_param_desc_, - data_desc_, mode_)); +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetTensorDescriptor( +// data_desc_, CudnnDataType::type, +// x_dims.size() > 3 ? 
x_dims.size() : 4, const_cast(dims.data()), +// const_cast(strides.data()))); +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenDeriveBNTensorDescriptor(bn_param_desc_, +// data_desc_, mode_)); #else PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( data_desc_, CudnnDataType::type, @@ -871,20 +1102,49 @@ class BatchNormGradKernel #endif // CUDNN_VERSION_MIN(7, 4, 1) if (!called) { #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenBatchNormalizationBackward( - dev_ctx.cudnn_handle(), mode_, CudnnDataType::kOne(), - CudnnDataType::kZero(), CudnnDataType::kOne(), - CudnnDataType::kZero(), data_desc_, - transformed_x.template data(), data_desc_, - transformed_d_y.template data(), data_desc_, - transformed_d_x.template mutable_data(ctx.GetPlace()), - bn_param_desc_, scale->template data>(), - d_scale->template mutable_data>( - ctx.GetPlace()), - d_bias->template mutable_data>( - ctx.GetPlace()), - epsilon, saved_mean_data, saved_var_data)); + if (compute_format == DataLayout::kNCHW) { + BNBackward< + T, block, + DataLayout::kNCHW><<>>( + transformed_d_y.template data(), + transformed_x.template data(), + scale->template data>(), saved_mean_data, + saved_var_data, C, N, H * W * D, epsilon, + transformed_d_x.template data(), + d_scale->template mutable_data>( + ctx.GetPlace()), + d_bias->template mutable_data>( + ctx.GetPlace())); + } else { + BNBackward< + T, block, + DataLayout::kNHWC><<>>( + transformed_d_y.template data(), + transformed_x.template data(), + scale->template data>(), saved_mean_data, + saved_var_data, C, N, H * W * D, epsilon, + transformed_d_x.template data(), + d_scale->template mutable_data>( + ctx.GetPlace()), + d_bias->template mutable_data>( + ctx.GetPlace())); + } + +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenBatchNormalizationBackward( +// dev_ctx.cudnn_handle(), mode_, CudnnDataType::kOne(), +// CudnnDataType::kZero(), CudnnDataType::kOne(), +// CudnnDataType::kZero(), data_desc_, +// transformed_x.template data(), data_desc_, +// transformed_d_y.template data(), data_desc_, +// transformed_d_x.template mutable_data(ctx.GetPlace()), +// bn_param_desc_, scale->template data>(), +// d_scale->template mutable_data>( +// ctx.GetPlace()), +// d_bias->template mutable_data>( +// ctx.GetPlace()), +// epsilon, saved_mean_data, saved_var_data)); #else PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::cudnnBatchNormalizationBackward( @@ -931,11 +1191,12 @@ class BatchNormGradKernel } #ifdef PADDLE_WITH_HIP - // clean when exit. - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_)); +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// clean when exit. +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_)); #else // clean when exit. 
PADDLE_ENFORCE_CUDA_SUCCESS( diff --git a/paddle/fluid/operators/bce_loss_op.cu b/paddle/fluid/operators/bce_loss_op.cu index 99153101fc326ca467ef47ac4733457fbd8335da..8bd2b7fe2d127c379e27e42f4c189c90fb7ebf8e 100644 --- a/paddle/fluid/operators/bce_loss_op.cu +++ b/paddle/fluid/operators/bce_loss_op.cu @@ -32,6 +32,11 @@ __global__ void GPUBCELossForward(const T* x_data, const T* label_data, T one = static_cast(1.); T neg_100 = static_cast(-100.); + PADDLE_ENFORCE( + (x >= static_cast(0)) && (x <= one), + "Input is expected to be within the interval [0, 1], but recieved %f.", + x); + T term1 = max(real_log(x), neg_100); T term2 = max(real_log(one - x), neg_100); @@ -64,29 +69,13 @@ class BCELossCUDAKernel : public framework::OpKernel { auto* labels = ctx.Input("Label"); auto* out = ctx.Output("Out"); - auto x_data = x->data(); - auto out_data = out->mutable_data(ctx.GetPlace()); + const auto* x_data = x->data(); + auto* out_data = out->mutable_data(ctx.GetPlace()); auto x_numel = x->numel(); - platform::GpuLaunchConfig config = - platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), x_numel); - - Tensor x_cpu; - framework::TensorCopy(*x, platform::CPUPlace(), &x_cpu); - T* x_cpu_data = x_cpu.data(); - - for (int64_t i = 0; i < x_numel; ++i) { - PADDLE_ENFORCE_GE( - x_cpu_data[i], static_cast(0), - platform::errors::InvalidArgument( - "Illegal input, input must be greater than or equal to 0")); - PADDLE_ENFORCE_LE( - x_cpu_data[i], static_cast(1), - platform::errors::InvalidArgument( - "Illegal input, input must be less than or equal to 1")); - } - auto& dev_ctx = ctx.cuda_device_context(); + platform::GpuLaunchConfig config = + platform::GetGpuLaunchConfig1D(dev_ctx, x_numel); GPUBCELossForward<<>>(x_data, labels->data(), @@ -102,9 +91,10 @@ class BCELossGradCUDAKernel : public framework::OpKernel { auto* labels = ctx.Input("Label"); auto* dout = ctx.Input(framework::GradVarName("Out")); auto* dx = ctx.Output(framework::GradVarName("X")); - auto dx_data = dx->mutable_data(ctx.GetPlace()); int x_numel = x->numel(); + auto* dx_data = dx->mutable_data(ctx.GetPlace()); + auto& dev_ctx = ctx.cuda_device_context(); platform::GpuLaunchConfig config = platform::GetGpuLaunchConfig1D(dev_ctx, x_numel); diff --git a/paddle/fluid/operators/cast_op.cc b/paddle/fluid/operators/cast_op.cc index c5cfa7a3bafce7b079cacaca4e57764439d4b282..40f4b969ec060d8453d176db67a6eb20933c6b3e 100644 --- a/paddle/fluid/operators/cast_op.cc +++ b/paddle/fluid/operators/cast_op.cc @@ -97,5 +97,6 @@ REGISTER_OP_CPU_KERNEL(cast, ops::CastOpKernel, ops::CastOpKernel, ops::CastOpKernel, ops::CastOpKernel, + ops::CastOpKernel, ops::CastOpKernel, ops::CastOpKernel); diff --git a/paddle/fluid/operators/cast_op.h b/paddle/fluid/operators/cast_op.h index 8fa0416049f8fa128d7ab61f8350b41960f07263..cd60c7707cb0aaedd526480c088fabef88b5079f 100644 --- a/paddle/fluid/operators/cast_op.h +++ b/paddle/fluid/operators/cast_op.h @@ -15,7 +15,6 @@ limitations under the License. */ #pragma once #include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/transform.h" diff --git a/paddle/fluid/operators/cast_op_npu.cc b/paddle/fluid/operators/cast_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..0de0f5e4505795f69f1d80e2bbc1600250fc7391 --- /dev/null +++ b/paddle/fluid/operators/cast_op_npu.cc @@ -0,0 +1,100 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
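// The PADDLE_WITH_HIP branches in the batch_norm_op.cu hunks above replace the MIOpen
// calls with hand-written kernels (BNForwardInference/BNForwardTraining/BNBackward).
// Below is a minimal CPU reference (assumptions: NCHW layout, float data; not part of
// the patch) of the per-channel statistics that BNForwardTraining parallelizes with
// one block per channel and a cub::BlockReduce.
#include <cmath>
#include <vector>

void BatchNormForwardRef(const std::vector<float>& x, int N, int C, int HxW,
                         const std::vector<float>& scale,
                         const std::vector<float>& bias, double epsilon,
                         std::vector<float>* y, std::vector<float>* mean,
                         std::vector<float>* inv_std) {
  for (int c = 0; c < C; ++c) {
    double sum = 0.0, sq_sum = 0.0;
    for (int n = 0; n < N; ++n) {
      for (int k = 0; k < HxW; ++k) {
        double v = x[(n * C + c) * HxW + k];
        sum += v;
        sq_sum += v * v;
      }
    }
    double m = sum / (N * HxW);
    double var = sq_sum / (N * HxW) - m * m;  // biased (population) variance
    double istd = 1.0 / std::sqrt(var + epsilon);
    (*mean)[c] = static_cast<float>(m);
    (*inv_std)[c] = static_cast<float>(istd);
    for (int n = 0; n < N; ++n) {
      for (int k = 0; k < HxW; ++k) {
        int i = (n * C + c) * HxW + k;
        (*y)[i] = static_cast<float>(scale[c] * (x[i] - m) * istd + bias[c]);
      }
    }
  }
}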
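// For reference, the elementwise loss that GPUBCELossForward (bce_loss_op.cu hunk
// above) evaluates; the patch moves the [0, 1] input check into the kernel via
// PADDLE_ENFORCE instead of copying the whole tensor to the host. This is a sketch of
// the math only, not the kernel itself.
#include <algorithm>
#include <cassert>
#include <cmath>

inline float BCELossRef(float x, float label) {
  assert(x >= 0.0f && x <= 1.0f);  // enforced on-device in the CUDA kernel
  // out = -(label * log(x) + (1 - label) * log(1 - x)),
  // with each log clamped to -100 so x == 0 or x == 1 stays finite
  float term1 = std::max(std::log(x), -100.0f);
  float term2 = std::max(std::log(1.0f - x), -100.0f);
  return -(label * term1 + (1.0f - label) * term2);
}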
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include + +#include "paddle/fluid/operators/cast_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +static std::map + DTYPE_2_ACL_DTYPE = { + {framework::proto::VarType::BOOL, ACL_BOOL}, + {framework::proto::VarType::INT16, ACL_INT16}, + {framework::proto::VarType::INT32, ACL_INT32}, + {framework::proto::VarType::INT64, ACL_INT64}, + {framework::proto::VarType::FP16, ACL_FLOAT16}, + {framework::proto::VarType::FP32, ACL_FLOAT}, + {framework::proto::VarType::FP64, ACL_DOUBLE}, +}; + +using Tensor = framework::Tensor; + +template +class CastNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + int dtype = ctx.Attr("out_dtype"); + auto* out = ctx.Output("Out"); + auto place = ctx.GetPlace(); + + if (x->type() == dtype) { + // NOTE(zhiqiu): NPU cast op may result in wrong value, so + // add special case here. + VLOG(4) << "cast to same dtype:" << dtype; + out->mutable_data(place, x->type()); + framework::TensorCopy( + *x, ctx.GetPlace(), + ctx.template device_context(), out); + return; + } + + auto iter = DTYPE_2_ACL_DTYPE.find( + static_cast(dtype)); + int aclDtype = iter->second; + + if (dtype == framework::proto::VarType::FP32) { + out->mutable_data(place); + } else if (dtype == framework::proto::VarType::FP16) { + out->mutable_data(place); + } else if (dtype == framework::proto::VarType::INT16) { + out->mutable_data(place); + } else if (dtype == framework::proto::VarType::INT32) { + out->mutable_data(place); + } else if (dtype == framework::proto::VarType::INT64) { + out->mutable_data(place); + } else if (dtype == framework::proto::VarType::FP64) { + out->mutable_data(place); + } else if (dtype == framework::proto::VarType::BOOL) { + out->mutable_data(place); + } + + auto stream = + ctx.template device_context() + .stream(); + + auto runner = NpuOpRunner("Cast", {*x}, {*out}, + {{"dst_type", static_cast(aclDtype)}}); + runner.Run(stream); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL( + cast, ops::CastNPUKernel, + ops::CastNPUKernel, + ops::CastNPUKernel, + ops::CastNPUKernel, + ops::CastNPUKernel, + ops::CastNPUKernel, + ops::CastNPUKernel, + ops::CastNPUKernel); diff --git a/paddle/fluid/operators/cast_op_xpu.cc b/paddle/fluid/operators/cast_op_xpu.cc index bbd43274a002d85e38d05aa00dcc79f1e11308f7..ca15858cf67d756fc8eb41f4e26a2e0b923abef6 100644 --- a/paddle/fluid/operators/cast_op_xpu.cc +++ b/paddle/fluid/operators/cast_op_xpu.cc @@ -23,8 +23,22 @@ limitations under the License. 
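// The CastNPUKernel above fetches the ACL dtype with DTYPE_2_ACL_DTYPE.find() and
// dereferences the iterator directly. A defensive variant of that lookup could look
// like the sketch below; the helper name and error message are illustrative
// assumptions, not part of the patch.
static aclDataType ToAclDataType(framework::proto::VarType::Type dtype) {
  auto iter = DTYPE_2_ACL_DTYPE.find(dtype);
  if (iter == DTYPE_2_ACL_DTYPE.end()) {
    // report the unsupported target dtype instead of reading an invalid iterator
    PADDLE_THROW(platform::errors::Unimplemented(
        "The NPU cast kernel does not support casting to dtype %d.",
        static_cast<int>(dtype)));
  }
  return iter->second;
}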
*/ namespace paddle { namespace operators { +template +class XPUFPTypeTrait { + public: + using Type = T; +}; + +template <> +class XPUFPTypeTrait { + public: + using Type = float16; +}; + template class CastXPUKernel : public framework::OpKernel { + using XPUInTDType = typename XPUFPTypeTrait::Type; + public: void Compute(const framework::ExecutionContext& context) const override { auto* in = context.Input("X"); @@ -34,27 +48,39 @@ class CastXPUKernel : public framework::OpKernel { auto out_type = static_cast( context.Attr("out_dtype")); auto* in_data = in->data(); + + // using XPUOutTDType = typename XPUFPTypeTrait::Type; auto numel = in->numel(); auto& dev_ctx = context.template device_context(); int r = -1; if (out_type == framework::proto::VarType::FP32) { auto* out_data = out->mutable_data(context.GetPlace()); - r = xpu::cast_v2(dev_ctx.x_context(), in_data, out_data, - numel); + r = xpu::cast_v2( + dev_ctx.x_context(), reinterpret_cast(in_data), + out_data, numel); } else if (out_type == framework::proto::VarType::INT32) { auto* out_data = out->mutable_data(context.GetPlace()); - r = xpu::cast_v2(dev_ctx.x_context(), in_data, out_data, - numel); + r = xpu::cast_v2( + dev_ctx.x_context(), reinterpret_cast(in_data), + out_data, numel); } else if (out_type == framework::proto::VarType::INT64) { auto* out_data = out->mutable_data(context.GetPlace()); - r = xpu::cast_v2(dev_ctx.x_context(), in_data, out_data, - numel); + r = xpu::cast_v2( + dev_ctx.x_context(), reinterpret_cast(in_data), + out_data, numel); } else if ((out_type == framework::proto::VarType::BOOL) && (in_type == framework::proto::VarType::FP32)) { auto* out_data = out->mutable_data(context.GetPlace()); r = xpu::cast_v2( dev_ctx.x_context(), (const float*)in_data, reinterpret_cast(out_data), numel); + } else if (out_type == framework::proto::VarType::FP16) { + auto* out_data = + out->mutable_data(context.GetPlace()); + r = xpu::cast_v2( + dev_ctx.x_context(), reinterpret_cast(in_data), + reinterpret_cast(out_data), numel); + } else { PADDLE_THROW(platform::errors::Unavailable("Not supported cast %d -> %d", in_type, out_type)); @@ -75,5 +101,7 @@ namespace ops = paddle::operators; REGISTER_OP_XPU_KERNEL( cast, ops::CastXPUKernel, ops::CastXPUKernel, + ops::CastXPUKernel, ops::CastXPUKernel); #endif diff --git a/paddle/fluid/operators/clip_op.cc b/paddle/fluid/operators/clip_op.cc index eb27df8a36757cd01636d176655011739eae1d56..7176a0466bb831cdbbaf66dfbb2d2625bdbf66cf 100644 --- a/paddle/fluid/operators/clip_op.cc +++ b/paddle/fluid/operators/clip_op.cc @@ -145,10 +145,14 @@ REGISTER_OPERATOR(clip_grad, ops::ClipOpGrad, ops::ClipGradInplaceInferer, ops::ClipDoubleGradOpMaker); REGISTER_OP_CPU_KERNEL( clip, ops::ClipKernel, - ops::ClipKernel); + ops::ClipKernel, + ops::ClipKernel, + ops::ClipKernel); REGISTER_OP_CPU_KERNEL( clip_grad, ops::ClipGradKernel, - ops::ClipGradKernel); + ops::ClipGradKernel, + ops::ClipGradKernel, + ops::ClipGradKernel); REGISTER_OP_VERSION(clip) .AddCheckpoint( diff --git a/paddle/fluid/operators/clip_op.cu b/paddle/fluid/operators/clip_op.cu index d31b81c13c5cf64560d6d38c994b28515813a664..fd61e4ea61d4ff20656dea842b02958c8c2701b9 100644 --- a/paddle/fluid/operators/clip_op.cu +++ b/paddle/fluid/operators/clip_op.cu @@ -17,8 +17,12 @@ limitations under the License. 
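// The cast_op_xpu.cc hunk above adds an XPUFPTypeTrait so that paddle's fp16 tensors
// are handed to the XPU runtime as the runtime's own fp16 type. The trait pattern in
// isolation (the two fp16 structs below are stand-ins, not the real types):
#include <cstdint>

struct PaddleFloat16 { uint16_t bits; };  // placeholder for platform::float16
struct XpuFloat16 { uint16_t bits; };     // placeholder for the device-side float16

template <typename T>
struct DeviceTypeTraitSketch {
  using Type = T;  // most dtypes map to themselves
};

template <>
struct DeviceTypeTraitSketch<PaddleFloat16> {
  using Type = XpuFloat16;  // fp16 is reinterpreted as the device's fp16
};

// usage inside a kernel body:
//   using DevT = typename DeviceTypeTraitSketch<T>::Type;
//   const DevT* dev_in = reinterpret_cast<const DevT*>(in_data);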
*/ namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( clip, ops::ClipKernel, - ops::ClipKernel); + ops::ClipKernel, + ops::ClipKernel, + ops::ClipKernel); REGISTER_OP_CUDA_KERNEL( clip_grad, ops::ClipGradKernel, - ops::ClipGradKernel); + ops::ClipGradKernel, + ops::ClipGradKernel, + ops::ClipGradKernel); diff --git a/paddle/fluid/operators/collective/CMakeLists.txt b/paddle/fluid/operators/collective/CMakeLists.txt index 8920541b9b9dcc5c52d27804262bd9c5169444ea..3f210219608fb7efa740ce2d4a52c736acdfdcc9 100644 --- a/paddle/fluid/operators/collective/CMakeLists.txt +++ b/paddle/fluid/operators/collective/CMakeLists.txt @@ -11,7 +11,7 @@ foreach(src ${OPS}) set_source_files_properties(${src} PROPERTIES COMPILE_FLAGS ${COLLECTIVE_COMPILE_FLAGS}) endforeach() -register_operators(EXCLUDES c_gen_bkcl_id_op gen_bkcl_id_op c_gen_nccl_id_op gen_nccl_id_op DEPS ${COLLECTIVE_DEPS}) +register_operators(EXCLUDES c_gen_bkcl_id_op gen_bkcl_id_op c_gen_nccl_id_op gen_nccl_id_op c_gen_hccl_id_op gen_hccl_id_op DEPS ${COLLECTIVE_DEPS}) if(WITH_NCCL OR WITH_RCCL) set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} nccl_common collective_helper) @@ -29,5 +29,38 @@ if(WITH_XPU_BKCL) op_library(gen_bkcl_id_op DEPS ${COLLECTIVE_DEPS}) endif() +if(WITH_ASCEND_CL) + cc_library(gen_hccl_id_op_helper SRCS gen_hccl_id_op_helper.cc DEPS dynload_warpctc dynamic_loader scope) + set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} collective_helper gen_hccl_id_op_helper) + op_library(c_gen_hccl_id_op DEPS ${COLLECTIVE_DEPS}) + op_library(gen_hccl_id_op DEPS ${COLLECTIVE_DEPS}) +endif() + set(OPERATOR_DEPS ${OPERATOR_DEPS} ${COLLECTIVE_DEPS} PARENT_SCOPE) set(GLOB_COLLECTIVE_DEPS ${COLLECTIVE_DEPS} CACHE INTERNAL "collective dependency") + +if(WITH_ASCEND_CL) + set(COMMON_TEST_DEPS_FOR_HCOM c_comm_init_hccl_op c_gen_hccl_id_op gen_hccl_id_op_helper + gen_hccl_id_op op_registry ascend_hccl flags + dynamic_loader dynload_warpctc scope device_context enforce executor) + cc_test(c_broadcast_op_npu_test SRCS c_broadcast_op_npu_test.cc + DEPS c_broadcast_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) + cc_test(c_allreduce_sum_op_npu_test SRCS c_allreduce_sum_op_npu_test.cc + DEPS c_allreduce_sum_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) + cc_test(c_reducescatter_op_npu_test SRCS c_reducescatter_op_npu_test.cc + DEPS c_reducescatter_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) + cc_test(c_allgather_op_npu_test SRCS c_allgather_op_npu_test.cc + DEPS c_allgather_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) + cc_test(c_reduce_sum_op_npu_test SRCS c_reduce_sum_op_npu_test.cc + DEPS c_reduce_sum_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) + cc_test(c_allreduce_max_op_npu_test SRCS c_allreduce_max_op_npu_test.cc + DEPS c_allreduce_max_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) + cc_test(send_v2_op_npu_test SRCS send_v2_op_npu_test.cc + DEPS send_v2_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) + cc_test(recv_v2_op_npu_test SRCS recv_v2_op_npu_test.cc + DEPS recv_v2_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) + cc_test(c_sync_comm_stream_op_npu_test SRCS c_sync_comm_stream_op_npu_test.cc + DEPS op_registry c_broadcast_op c_comm_init_hccl_op c_sync_comm_stream_op c_gen_hccl_id_op gen_hccl_id_op_helper ${COLLECTIVE_DEPS} ascend_hccl dynamic_loader dynload_warpctc scope device_context enforce executor) + cc_test(c_sync_calc_stream_op_npu_test SRCS c_sync_calc_stream_op_npu_test.cc + DEPS op_registry elementwise_add_op c_sync_calc_stream_op c_gen_hccl_id_op gen_hccl_id_op_helper ${COLLECTIVE_DEPS} 
ascend_hccl dynamic_loader dynload_warpctc scope device_context enforce executor) +endif() diff --git a/paddle/fluid/operators/collective/allreduce_op.cc b/paddle/fluid/operators/collective/allreduce_op.cc index 86f1c28a9dd4f53400418c93f8598b7a9c38f4cc..63b135a74cf4b7b80b8baec462aa920fce370f7e 100644 --- a/paddle/fluid/operators/collective/allreduce_op.cc +++ b/paddle/fluid/operators/collective/allreduce_op.cc @@ -15,7 +15,7 @@ limitations under the License. */ #include // NOLINT #include -#include "paddle/fluid/operators/distributed_ops/allreduce_op.h" +#include "paddle/fluid/operators/collective/allreduce_op.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/collective/allreduce_op.cu.cc b/paddle/fluid/operators/collective/allreduce_op.cu.cc index 9b70f78399026b9f853b8315f0acf6dbad64242a..fe2e49105527065a8ccfd9e0b00cb88a1f879304 100644 --- a/paddle/fluid/operators/collective/allreduce_op.cu.cc +++ b/paddle/fluid/operators/collective/allreduce_op.cu.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/distributed_ops/allreduce_op.h" +#include "paddle/fluid/operators/collective/allreduce_op.h" namespace ops = paddle::operators; namespace plat = paddle::platform; diff --git a/paddle/fluid/operators/collective/alltoall_op.cc b/paddle/fluid/operators/collective/alltoall_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..1c57b9f99676337c88d6a51927195eeedb8b0a2a --- /dev/null +++ b/paddle/fluid/operators/collective/alltoall_op.cc @@ -0,0 +1,94 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/collective/alltoall_op.h" + +namespace paddle { +namespace operators { + +class AllToAllOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "AllToAll"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "AllToAll"); + int ring_id = ctx->Attrs().Get("ring_id"); + PADDLE_ENFORCE_GE( + ring_id, 0, + platform::errors::InvalidArgument( + "The ring_id (%d) for alltoall op must be non-negative.", ring_id)); + framework::DDim dim = ctx->GetInputDim("X"); + if (dim[0] < 0) dim[0] = -1; + ctx->SetOutputDim("Out", dim); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); + } +}; + +class AllToAllOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("X", "(Tensor) tensor send."); + AddOutput("Out", "(Tensor) the result of alltoall."); + AddAttr("ring_id", "(int default 0) nccl communication ring id.") + .SetDefault(0); + AddAttr( + "use_calc_stream", + "(bool default false) eject CUDA operations to calculation stream.") + .SetDefault(false); + AddComment(R"DOC( +AllToAll Operator +Scatter tensors from all participators to all participators. +)DOC"); + } +}; + +template +class AllToAllOpGradMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr retv) const override { + retv->SetType("alltoall"); + retv->SetInput("X", this->OutputGrad("Out")); + retv->SetOutput("Out", this->InputGrad("X")); + retv->SetAttrMap(this->Attrs()); + } +}; + +DECLARE_INPLACE_OP_INFERER(AllToAllInplaceInferer, {"X", "Out"}); + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OPERATOR(alltoall, ops::AllToAllOp, ops::AllToAllOpMaker, + ops::AllToAllOpGradMaker, + ops::AllToAllOpGradMaker, + ops::AllToAllInplaceInferer) + +REGISTER_OP_CPU_KERNEL(alltoall, ops::AllToAllOpCPUKernel, + ops::AllToAllOpCPUKernel, + ops::AllToAllOpCPUKernel, + ops::AllToAllOpCPUKernel, + ops::AllToAllOpCPUKernel); diff --git a/paddle/fluid/operators/collective/alltoall_op.cu.cc b/paddle/fluid/operators/collective/alltoall_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..1bcb47fc686cfe4b93420697b15d0c2585f0358e --- /dev/null +++ b/paddle/fluid/operators/collective/alltoall_op.cu.cc @@ -0,0 +1,95 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/collective/alltoall_op.h" + +#if defined(PADDLE_WITH_NCCL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/nccl_helper.h" +#endif + +namespace paddle { +namespace operators { + +template +class AllToAllOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_NCCL) +#if NCCL_VERSION_CODE >= 2703 + auto x = ctx.Input("X"); + auto out = ctx.Output("Out"); + int send_numel = x->numel(); + ncclDataType_t dtype = platform::ToNCCLDataType(x->type()); + + int ring_id = ctx.Attr("ring_id"); + PADDLE_ENFORCE_GE( + ring_id, 0, + platform::errors::InvalidArgument( + "The ring_id (%d) for alltoall op must be non-negative.", ring_id)); + auto place = ctx.GetPlace(); + auto comm = platform::NCCLCommContext::Instance().Get(ring_id, place); + int nranks = comm->nranks(); + + cudaStream_t stream = nullptr; + if (ctx.Attr("use_calc_stream")) { + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + stream = static_cast(dev_ctx)->stream(); + } else { + stream = comm->stream(); + } + + framework::DDim x_dims = x->dims(); + framework::DDim out_dims(x_dims); + PADDLE_ENFORCE_EQ( + x_dims[0] % nranks, 0, + platform::errors::InvalidArgument( + "The first dimension size (%d) of the input tensor must be " + "divisible by the number of ranks (%d).", + x_dims[0], nranks)); + auto send_buf = x->data(); + auto recv_buf = out->mutable_data(out_dims, place); + size_t offset = 0; + send_numel /= nranks; + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclGroupStart()); + for (auto i = 0; i < nranks; ++i) { + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclSend( + send_buf + offset, send_numel, dtype, i, comm->comm(), stream)); + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclRecv( + recv_buf + offset, send_numel, dtype, i, comm->comm(), stream)); + offset += send_numel; + } + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclGroupEnd()); +#else + PADDLE_THROW( + platform::errors::Unavailable("NCCL version >= 2.7.3 is needed.")); +#endif +#else + PADDLE_THROW( + platform::errors::Unavailable("PaddlePaddle should compile with GPU.")); +#endif + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL(alltoall, ops::AllToAllOpCUDAKernel, + ops::AllToAllOpCUDAKernel, + ops::AllToAllOpCUDAKernel, + ops::AllToAllOpCUDAKernel, + ops::AllToAllOpCUDAKernel); diff --git a/paddle/fluid/operators/distributed_ops/split_byref_op.h b/paddle/fluid/operators/collective/alltoall_op.h similarity index 51% rename from paddle/fluid/operators/distributed_ops/split_byref_op.h rename to paddle/fluid/operators/collective/alltoall_op.h index fedd7218dd6cc9481e94a92a3820cafbe4157bd0..61eec44093794ccaf820d257d7c2c6b363e10391 100644 --- a/paddle/fluid/operators/distributed_ops/split_byref_op.h +++ b/paddle/fluid/operators/collective/alltoall_op.h @@ -1,10 +1,10 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
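// A host-side reference (sketch only, independent of the Paddle/NCCL API) of what the
// grouped ncclSend/ncclRecv loop in AllToAllOpCUDAKernel above exchanges: every rank
// splits its buffer into nranks equal chunks, and chunk dst of rank src becomes chunk
// src of rank dst.
#include <vector>

std::vector<std::vector<float>> AllToAllRef(
    const std::vector<std::vector<float>>& all_data, int nranks, int chunk) {
  // all_data[r] is rank r's input of length nranks * chunk
  std::vector<std::vector<float>> out(nranks,
                                      std::vector<float>(nranks * chunk));
  for (int src = 0; src < nranks; ++src) {
    for (int dst = 0; dst < nranks; ++dst) {
      for (int k = 0; k < chunk; ++k) {
        out[dst][src * chunk + k] = all_data[src][dst * chunk + k];
      }
    }
  }
  return out;
}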
You may obtain a copy of the License at -http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, @@ -14,28 +14,27 @@ limitations under the License. */ #pragma once +#include +#include #include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" +#if defined(PADDLE_WITH_GLOO) +#include "paddle/fluid/framework/fleet/gloo_wrapper.h" +#endif + namespace paddle { namespace operators { -template -class SplitByrefOpKernel : public framework::OpKernel { +template +class AllToAllOpCPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto outs = ctx.MultiOutput("Out"); - auto place = ctx.GetPlace(); - - size_t row_offset = 0; - for (size_t i = 0; i < outs.size(); ++i) { - // NOTE: no need to call mutable_data here to allocate memory. - auto* out = outs[i]; - VLOG(3) << "spliting by ref: " << row_offset << " " << out->dims()[0]; - *out = in->Slice(row_offset, row_offset + out->dims()[0]); - row_offset += out->dims()[0]; - } + PADDLE_THROW(platform::errors::Unavailable( + "Do not support alltoall for cpu kernel now.")); } }; diff --git a/paddle/fluid/operators/collective/c_allgather_op.cc b/paddle/fluid/operators/collective/c_allgather_op.cc index 4111a19c5ebc8cecec02b5a08fbc3337ffa665a1..c4e779698cccafc6d958e823f087a1276b6246c3 100644 --- a/paddle/fluid/operators/collective/c_allgather_op.cc +++ b/paddle/fluid/operators/collective/c_allgather_op.cc @@ -42,6 +42,10 @@ class CAllGatherOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("Out", "(Tensor) the allgather result"); AddAttr("ring_id", "(int default 0) communication ring id.") .SetDefault(0); +#if defined(PADDLE_WITH_ASCEND_CL) + AddAttr("tag", "(string default tag) tag for all gather.") + .SetDefault("tag"); +#endif AddAttr( "use_calc_stream", "(bool default false) eject CUDA operations to calculation stream.") diff --git a/paddle/fluid/operators/collective/c_allgather_op_npu.cc b/paddle/fluid/operators/collective/c_allgather_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..e7f05549d9efea8579e103826aca6664c8cd9f9b --- /dev/null +++ b/paddle/fluid/operators/collective/c_allgather_op_npu.cc @@ -0,0 +1,83 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/collective/c_allgather_op.h" + +#include + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace paddle { +namespace operators { + +template +class CAllGatherOpASCENDKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { +#if defined(PADDLE_WITH_ASCEND_CL) + auto in = ctx.Input("X"); + auto out = ctx.Output("Out"); + HcclDataType dtype = platform::ToHCCLDataType(in->type()); + + int ring_id = ctx.Attr("ring_id"); + std::string group = + std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id); + auto place = ctx.GetPlace(); + auto comm = platform::HCCLCommContext::Instance().Get(ring_id, place); + int nranks = comm->nranks(); + + framework::DDim out_dims = in->dims(); + out_dims[0] *= nranks; + out->mutable_data(out_dims, place); + + uint64_t send_numel = in->numel(); + void *send_buff = reinterpret_cast(const_cast(in->data())); + void *recv_buff = reinterpret_cast(out->data()); + + aclrtStream stream = nullptr; + if (ctx.Attr("use_calc_stream")) { + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + stream = static_cast(dev_ctx)->stream(); + } else { + stream = comm->stream(); + } + + VLOG(3) << "begin hccl allgather, parameter is: " + << ", group is " << group << ", ring_id is " << ring_id + << ", nranks is " << nranks; + + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclAllGather( + send_buff, recv_buff, send_numel, dtype, comm->comm(), + reinterpret_cast(stream))); + +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with NPU.")); +#endif + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(c_allgather, ops::CAllGatherOpASCENDKernel, + ops::CAllGatherOpASCENDKernel, + ops::CAllGatherOpASCENDKernel, + ops::CAllGatherOpASCENDKernel); diff --git a/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc b/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..4c7dfc4aad7d0ec11486d12551d0670d38626579 --- /dev/null +++ b/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc @@ -0,0 +1,192 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +#include "paddle/fluid/operators/collective/c_allgather_op.h" +#include "paddle/fluid/operators/collective/c_allreduce_op.h" +#include "paddle/fluid/operators/collective/c_broadcast_op.h" +#include "paddle/fluid/operators/collective/c_reducescatter_op.h" +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(c_allgather); +USE_NO_KERNEL_OP(c_gen_hccl_id); +USE_NO_KERNEL_OP(c_comm_init_hccl); +USE_OP_DEVICE_KERNEL(c_allgather, NPU); + +DECLARE_string(selected_npus); + +template +void PrintDebugInfo(const std::string preStr, const std::vector& data) { + std::string debugstring = ""; + for (auto ele : data) { + debugstring += std::to_string(ele) + std::string(","); + } + VLOG(2) << preStr << ":" << std::endl << debugstring; +} + +void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + std::vector rank_ids{0, 1}; + f::AttributeMap gen_hccl_id; + + std::vector endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"}; + gen_hccl_id["rank"] = rank_id; + gen_hccl_id["endpoint"] = endpointList[rank_id]; + std::vector other_endpoints = { + endpointList[rank_id == 0 ? 
1 : 0]}; + gen_hccl_id["other_endpoints"] = other_endpoints; + + auto out = scope->Var("Out"); + auto id = out->GetMutable(); + + VLOG(3) << "break"; + + auto comm_init_op = f::OpRegistry::CreateOp("c_gen_hccl_id", {}, + {{"Out", {"Out"}}}, gen_hccl_id); + VLOG(3) << "break"; + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); + + memcpy(hccl_id, id, 1024); +} + +void Prepare(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + auto x = scope->Var("X"); + auto id = x->GetMutable(); + + memcpy(id, hccl_id, 1024); + + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + // std::vector rank_ids{0, 1}; + f::AttributeMap comm_init_attrs; + comm_init_attrs["ring_id"] = 0; + comm_init_attrs["rank_ids"] = 2; + comm_init_attrs["rank"] = rank_id; + comm_init_attrs["device_id"] = device_id; + // comm_init_attrs["rank_ids"] = rank_ids; + auto comm_init_op = f::OpRegistry::CreateOp( + "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); +} + +void TestHCCLAllGatherOp(f::Scope* scope, const p::DeviceContext& ctx) { + // init + auto x = scope->Var("Data"); + auto tensor_x = x->GetMutable(); + + std::vector init; + int rank_id = atoi(getenv("RANK_ID")); + + int num1 = 1; + int num2 = 4; + + for (int64_t i = 0; i < num1 * num2; ++i) { + init.push_back(1.0 + rank_id); + } + PrintDebugInfo("input data", init); + + TensorFromVector(init, ctx, tensor_x); + tensor_x->Resize({num1, num2}); + ctx.Wait(); + + auto place = ctx.GetPlace(); + auto out = scope->Var("OutData"); + auto tensor_out = out->GetMutable(); + tensor_out->Resize({num1, num2}); + tensor_out->mutable_data(place); // allocate + ctx.Wait(); + + // run + f::AttributeMap attrs; + attrs["tag"] = std::string("tagx"); + attrs["ring_id"] = 0; + attrs["nranks"] = 2; + + auto op = f::OpRegistry::CreateOp("c_allgather", {{"X", {"Data"}}}, + {{"Out", {"OutData"}}}, attrs); + + for (int i = 0; i < 10; i++) { + op->Run(*scope, place); + } + ctx.Wait(); + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + ctx.Wait(); + + PrintDebugInfo("output data", out_vec); + + EXPECT_EQ(out_vec.size(), init.size() * 2); + for (uint32_t i = 0; i < out_vec.size() / 2; i++) { + EXPECT_EQ(out_vec[i], 1.0); + } + for (uint32_t i = out_vec.size() / 2; i < out_vec.size(); i++) { + EXPECT_EQ(out_vec[i], 2.0); + } +} + +TEST(c_allgather, NPU) { + f::Scope scope; + HcclRootInfo hccl_id; + + // only support one device, if more than one device, use first default + p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); + + PrepareUniqueId(&scope, ctx, &hccl_id); + Prepare(&scope, ctx, &hccl_id); + TestHCCLAllGatherOp(&scope, ctx); +} diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op.cc b/paddle/fluid/operators/collective/c_allreduce_max_op.cc index 835b49e57bc0922a0d0be7895b57275ba31d2173..8bdbdfac8ffd1d8294aca28e90a9b6471c0fc2a9 100644 --- a/paddle/fluid/operators/collective/c_allreduce_max_op.cc +++ b/paddle/fluid/operators/collective/c_allreduce_max_op.cc @@ -37,14 +37,19 @@ class CAllReduceMaxOpMaker : public CAllReduceOpMaker { std::string GetName() const override { return "Max"; } }; +DECLARE_INPLACE_OP_INFERER(AllreduceMaxInplaceInferer, {"X", "Out"}); + } // namespace operators } // namespace paddle 
namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_WITHOUT_GRADIENT(c_allreduce_max, ops::CAllReduceOp, - ops::CAllReduceMaxOpMaker); +REGISTER_OPERATOR( + c_allreduce_max, ops::CAllReduceOp, ops::CAllReduceMaxOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker, + ops::AllreduceMaxInplaceInferer) REGISTER_OP_CPU_KERNEL(c_allreduce_max, ops::CAllReduceOpCPUKernel, diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op_npu.cc b/paddle/fluid/operators/collective/c_allreduce_max_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..4dece4a3721ff5d557e1008bd70a8a8d4a4b1c58 --- /dev/null +++ b/paddle/fluid/operators/collective/c_allreduce_max_op_npu.cc @@ -0,0 +1,31 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_allreduce_op.h" + +namespace paddle { +namespace platform { +struct ASCENDPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL( + c_allreduce_max, ops::CAllReduceOpASCENDKernel, + ops::CAllReduceOpASCENDKernel, + ops::CAllReduceOpASCENDKernel, + ops::CAllReduceOpASCENDKernel) diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc b/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..b7fd2739d51181cd9bb774a31ecfb108a909f4c1 --- /dev/null +++ b/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc @@ -0,0 +1,188 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +#include "paddle/fluid/operators/collective/c_allgather_op.h" +#include "paddle/fluid/operators/collective/c_allreduce_op.h" +#include "paddle/fluid/operators/collective/c_broadcast_op.h" +#include "paddle/fluid/operators/collective/c_reducescatter_op.h" +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(c_allreduce_max); +USE_NO_KERNEL_OP(c_gen_hccl_id); +USE_NO_KERNEL_OP(c_comm_init_hccl); +USE_OP_DEVICE_KERNEL(c_allreduce_max, NPU); + +DECLARE_string(selected_npus); + +template +void PrintDebugInfo(const std::string preStr, const std::vector& data) { + std::string debugstring = ""; + for (auto ele : data) { + debugstring += std::to_string(ele) + std::string(","); + } + VLOG(2) << preStr << ":" << std::endl << debugstring; +} + +void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + std::vector rank_ids{0, 1}; + f::AttributeMap gen_hccl_id; + + std::vector endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"}; + gen_hccl_id["rank"] = rank_id; + gen_hccl_id["endpoint"] = endpointList[rank_id]; + std::vector other_endpoints = { + endpointList[rank_id == 0 ? 
1 : 0]}; + gen_hccl_id["other_endpoints"] = other_endpoints; + + auto out = scope->Var("Out"); + auto id = out->GetMutable(); + + VLOG(3) << "break"; + + auto comm_init_op = f::OpRegistry::CreateOp("c_gen_hccl_id", {}, + {{"Out", {"Out"}}}, gen_hccl_id); + VLOG(3) << "break"; + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); + + memcpy(hccl_id, id, 1024); +} + +void Prepare(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + auto x = scope->Var("X"); + auto id = x->GetMutable(); + + memcpy(id, hccl_id, 1024); + + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + // std::vector rank_ids{0, 1}; + f::AttributeMap comm_init_attrs; + comm_init_attrs["ring_id"] = 0; + comm_init_attrs["rank_ids"] = 2; + comm_init_attrs["rank"] = rank_id; + comm_init_attrs["device_id"] = device_id; + // comm_init_attrs["rank_ids"] = rank_ids; + auto comm_init_op = f::OpRegistry::CreateOp( + "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); +} + +void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx) { + // init + auto x = scope->Var("Data"); + auto tensor_x = x->GetMutable(); + + std::vector init; + int rank_id = atoi(getenv("RANK_ID")); + + int num1 = 100; + int num2 = 100; + + for (int64_t i = 0; i < num1 * num2; ++i) { + init.push_back(1.0 + rank_id * 3); + } + PrintDebugInfo("input data", init); + + TensorFromVector(init, ctx, tensor_x); + tensor_x->Resize({num1, num2}); + ctx.Wait(); + + auto place = ctx.GetPlace(); + auto out = scope->Var("OutData"); + auto tensor_out = out->GetMutable(); + tensor_out->Resize({num1, num2}); + tensor_out->mutable_data(place); // allocate + ctx.Wait(); + + // run + f::AttributeMap attrs; + attrs["tag"] = std::string("tagx"); + attrs["ring_id"] = 0; + + auto op = f::OpRegistry::CreateOp("c_allreduce_max", {{"X", {"Data"}}}, + {{"Out", {"OutData"}}}, attrs); + + for (int i = 0; i < 10; i++) { + op->Run(*scope, place); + } + ctx.Wait(); + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + ctx.Wait(); + + PrintDebugInfo("output data", out_vec); + + EXPECT_EQ(out_vec.size(), init.size()); + for (uint32_t i = 0; i < out_vec.size(); i++) { + EXPECT_EQ(out_vec[i], 4.0); + } +} + +TEST(c_allreduce_max, NPU) { + f::Scope scope; + HcclRootInfo hccl_id; + + // only support one device, if more than one device, use first default + p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); + + PrepareUniqueId(&scope, ctx, &hccl_id); + Prepare(&scope, ctx, &hccl_id); + TestHCCLAllReduceOp(&scope, ctx); +} diff --git a/paddle/fluid/operators/distributed_ops/allreduce_op.cu.cc b/paddle/fluid/operators/collective/c_allreduce_max_op_xpu.cc similarity index 55% rename from paddle/fluid/operators/distributed_ops/allreduce_op.cu.cc rename to paddle/fluid/operators/collective/c_allreduce_max_op_xpu.cc index 9b70f78399026b9f853b8315f0acf6dbad64242a..b0aa51f7cfdfdcd51db6b31a76a6c5c8b77b3d62 100644 --- a/paddle/fluid/operators/distributed_ops/allreduce_op.cu.cc +++ b/paddle/fluid/operators/collective/c_allreduce_max_op_xpu.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -12,14 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/distributed_ops/allreduce_op.h" +#include "paddle/fluid/operators/collective/c_allreduce_op.h" + +namespace paddle { +namespace platform { +struct XPUPlace; +struct float16; +} // namespace platform +} // namespace paddle namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL( - allreduce, ops::AllReduceOpKernel, - ops::AllReduceOpKernel, - ops::AllReduceOpKernel, - ops::AllReduceOpKernel, - ops::AllReduceOpKernel); +REGISTER_OP_XPU_KERNEL(c_allreduce_max, + ops::CAllReduceOpXPUKernel) diff --git a/paddle/fluid/operators/collective/c_allreduce_min_op.cc b/paddle/fluid/operators/collective/c_allreduce_min_op.cc index efc19659c83ec35c9650d3184654f97d23940745..9d913b12b13767a1375b5b93bbcc483bdbd51a22 100644 --- a/paddle/fluid/operators/collective/c_allreduce_min_op.cc +++ b/paddle/fluid/operators/collective/c_allreduce_min_op.cc @@ -37,14 +37,19 @@ class CAllReduceMinOpMaker : public CAllReduceOpMaker { std::string GetName() const override { return "Min"; } }; +DECLARE_INPLACE_OP_INFERER(AllreduceMinInplaceInferer, {"X", "Out"}); + } // namespace operators } // namespace paddle namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_WITHOUT_GRADIENT(c_allreduce_min, ops::CAllReduceOp, - ops::CAllReduceMinOpMaker); +REGISTER_OPERATOR( + c_allreduce_min, ops::CAllReduceOp, ops::CAllReduceMinOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker, + ops::AllreduceMinInplaceInferer) REGISTER_OP_CPU_KERNEL(c_allreduce_min, ops::CAllReduceOpCPUKernel, diff --git a/paddle/fluid/operators/collective/c_allreduce_min_op_npu.cc b/paddle/fluid/operators/collective/c_allreduce_min_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..48e1d2eeb58c5227d57c95d9cb91028cc3266e55 --- /dev/null +++ b/paddle/fluid/operators/collective/c_allreduce_min_op_npu.cc @@ -0,0 +1,31 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/collective/c_allreduce_op.h" + +namespace paddle { +namespace platform { +struct ASCENDPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL( + c_allreduce_min, ops::CAllReduceOpASCENDKernel, + ops::CAllReduceOpASCENDKernel, + ops::CAllReduceOpASCENDKernel, + ops::CAllReduceOpASCENDKernel) diff --git a/paddle/fluid/operators/collective/c_allreduce_min_op_xpu.cc b/paddle/fluid/operators/collective/c_allreduce_min_op_xpu.cc new file mode 100644 index 0000000000000000000000000000000000000000..2f16a89c217dacb1529ab2f57d300aceabb95a85 --- /dev/null +++ b/paddle/fluid/operators/collective/c_allreduce_min_op_xpu.cc @@ -0,0 +1,28 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_allreduce_op.h" + +namespace paddle { +namespace platform { +struct XPUPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_XPU_KERNEL(c_allreduce_min, + ops::CAllReduceOpXPUKernel) diff --git a/paddle/fluid/operators/collective/c_allreduce_op.h b/paddle/fluid/operators/collective/c_allreduce_op.h index 2f56f43d793fa941e96e5711ac48eb2899290259..3a74f551e7a30ed64104f8054a4e063fa816944e 100644 --- a/paddle/fluid/operators/collective/c_allreduce_op.h +++ b/paddle/fluid/operators/collective/c_allreduce_op.h @@ -19,17 +19,31 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/memory/memory.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_XPU_BKCL) #include "paddle/fluid/platform/collective_helper.h" +#endif + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/nccl_helper.h" #endif +#if defined(PADDLE_WITH_XPU_BKCL) +#include "paddle/fluid/platform/bkcl_helper.h" +#endif + #if defined(PADDLE_WITH_GLOO) #include #include "paddle/fluid/framework/fleet/gloo_wrapper.h" #endif +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/hccl_helper.h" +#endif + namespace paddle { namespace operators { @@ -105,6 +119,136 @@ class CAllReduceOpCPUKernel : public framework::OpKernel { } }; +template +class CAllReduceOpASCENDKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_ASCEND_CL) + auto in = ctx.Input("X"); + auto out = ctx.Output("Out"); + auto place = ctx.GetPlace(); + HcclDataType dtype = platform::ToHCCLDataType(in->type()); + int64_t numel = in->numel(); + + void* sendbuff = reinterpret_cast(const_cast(in->data())); + out->mutable_data(in->dims(), ctx.GetPlace()); + void* recvbuff = reinterpret_cast(out->data()); + + int ring_id = ctx.Attr("ring_id"); + std::string group = + std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id); + auto comm = + paddle::platform::HCCLCommContext::Instance().Get(ring_id, place); + + aclrtStream stream = nullptr; + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + if (ctx.Attr("use_calc_stream")) { + stream = static_cast(dev_ctx)->stream(); + } else { + stream = comm->stream(); + } + + HcclReduceOp hccl_red_type = HCCL_REDUCE_SUM; + switch (red_type) { + case kRedSum: + hccl_red_type = HCCL_REDUCE_SUM; + break; + + case kRedMax: + hccl_red_type = HCCL_REDUCE_MAX; + break; + + case kRedMin: + hccl_red_type = HCCL_REDUCE_MIN; + break; + + case kRedProd: + hccl_red_type = HCCL_REDUCE_PROD; + break; + + default: + PADDLE_THROW(platform::errors::InvalidArgument( + "Invalid reduce type: %d", red_type)); + } + + VLOG(3) << "begin hccl allreduce, parameter is: " + << "input num: " << numel << "dtype: " << dtype + << "hccl_red_type: " << hccl_red_type << ", group is: " << group; + + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclAllReduce( + sendbuff, recvbuff, numel, dtype, hccl_red_type, comm->comm(), + reinterpret_cast(stream))); + + out->Resize(in->dims()); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with NPU.")); +#endif + } +}; + +template +class CAllReduceOpXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_XPU_BKCL) + auto in = ctx.Input("X"); + auto out = ctx.Output("Out"); + + auto place = ctx.GetPlace(); + BKCLDataType dtype = platform::ToBKCLDataType(in->type()); + int64_t numel = in->numel(); + const void* sendbuff = in->data(); + out->Resize(in->dims()); + void* recvbuff = out->mutable_data(place); + + int rid = ctx.Attr("ring_id"); + auto comm = platform::BKCLCommContext::Instance().Get(rid, place); + + XPUStream stream = nullptr; + if (ctx.Attr("use_calc_stream")) { + auto dev_ctx 
= platform::DeviceContextPool::Instance().Get(place); + stream = static_cast(dev_ctx) + ->x_context() + ->xpu_stream; + } else { + stream = comm->stream(); + } + + BKCLOp bkcl_red_type = BKCL_ADD; + switch (red_type) { + case kRedSum: + bkcl_red_type = BKCL_ADD; + break; + + case kRedMax: + bkcl_red_type = BKCL_MAX; + break; + + case kRedMin: + bkcl_red_type = BKCL_MIN; + break; + + case kRedProd: + bkcl_red_type = BKCL_PRODUCT; + break; + + default: + PADDLE_THROW(platform::errors::InvalidArgument( + "Invalid reduce type: %d", red_type)); + } + + PADDLE_ENFORCE_EQ(bkcl_all_reduce(comm->comm(), sendbuff, recvbuff, numel, + dtype, bkcl_red_type, stream), + BKCL_SUCCESS, platform::errors::PreconditionNotMet( + "BKCL all reduce failed")); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should be compiled with XPU.")); +#endif + } +}; + template class CAllReduceOpCUDAKernel : public framework::OpKernel { public: @@ -170,10 +314,20 @@ class CAllReduceOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("Out", "(Tensor) the allreduced result."); AddAttr("ring_id", "(int default 0) communication ring id.") .SetDefault(0); +#if defined(PADDLE_WITH_ASCEND_CL) + AddAttr("tag", "(string default tag) tag for all reduce.") + .SetDefault("tag"); +#endif AddAttr( "use_calc_stream", "(bool default false) eject CUDA operations to calculation stream.") .SetDefault(false); + AddAttr( + "use_model_parallel", + "(bool default false) use this op with model parallel mode. In model " + "parallel mode, the backward is c_identity which returns itself for " + "c_allreduce_sum.") + .SetDefault(false); AddComment(string::Sprintf(R"DOC( CAllReduce %s Operator diff --git a/paddle/fluid/operators/collective/c_allreduce_prod_op.cc b/paddle/fluid/operators/collective/c_allreduce_prod_op.cc index 5ab07ef026bac5bef7386b0789803933cd8fdf2a..3ad078e1c8ff0f3438eb1c74ddc82c537a73bf5f 100644 --- a/paddle/fluid/operators/collective/c_allreduce_prod_op.cc +++ b/paddle/fluid/operators/collective/c_allreduce_prod_op.cc @@ -37,14 +37,19 @@ class CAllReduceProdOpMaker : public CAllReduceOpMaker { std::string GetName() const override { return "Prod"; } }; +DECLARE_INPLACE_OP_INFERER(AllreduceProdInplaceInferer, {"X", "Out"}); + } // namespace operators } // namespace paddle namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_WITHOUT_GRADIENT(c_allreduce_prod, ops::CAllReduceOp, - ops::CAllReduceProdOpMaker); +REGISTER_OPERATOR( + c_allreduce_prod, ops::CAllReduceOp, ops::CAllReduceProdOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker, + ops::AllreduceProdInplaceInferer) REGISTER_OP_CPU_KERNEL(c_allreduce_prod, ops::CAllReduceOpCPUKernel, diff --git a/paddle/fluid/operators/collective/c_allreduce_prod_op_npu.cc b/paddle/fluid/operators/collective/c_allreduce_prod_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..f3d14afe0a1bc7228b4cb8517e09633e916c46ac --- /dev/null +++ b/paddle/fluid/operators/collective/c_allreduce_prod_op_npu.cc @@ -0,0 +1,31 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
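// ---------------------------------------------------------------------------
// [Editorial sketch, not part of this patch] The new ASCEND and XPU kernels in
// c_allreduce_op.h above only map the compile-time red_type
// (kRedSum/kRedMax/kRedMin/kRedProd) onto the matching HCCL or BKCL reduction
// and issue a single collective call. The host-side reference below shows what
// each reduction produces element-wise (SimulateAllReduce is a hypothetical
// helper, not a Paddle API):
// ---------------------------------------------------------------------------
#include <algorithm>
#include <vector>

enum RefRedType { kRefSum, kRefMax, kRefMin, kRefProd };

std::vector<float> SimulateAllReduce(
    const std::vector<std::vector<float>>& inputs, RefRedType red_type) {
  std::vector<float> out = inputs.at(0);
  for (size_t r = 1; r < inputs.size(); ++r) {
    for (size_t i = 0; i < out.size(); ++i) {
      switch (red_type) {
        case kRefSum:  out[i] += inputs[r][i]; break;
        case kRefMax:  out[i] = std::max(out[i], inputs[r][i]); break;
        case kRefMin:  out[i] = std::min(out[i], inputs[r][i]); break;
        case kRefProd: out[i] *= inputs[r][i]; break;
      }
    }
  }
  return out;  // every rank receives this same reduced tensor
}

int main() {
  // Mirrors the two-rank c_allreduce_max NPU test above, where ranks hold
  // 1.0 + 3 * rank, i.e. 1.0 and 4.0; the element-wise max asserted there is 4.0.
  auto out = SimulateAllReduce({{1.0f}, {4.0f}}, kRefMax);
  return out.at(0) == 4.0f ? 0 : 1;
}
// ---------------------------------------------------------------------------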
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_allreduce_op.h" + +namespace paddle { +namespace platform { +struct ASCENDPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL( + c_allreduce_prod, ops::CAllReduceOpASCENDKernel, + ops::CAllReduceOpASCENDKernel, + ops::CAllReduceOpASCENDKernel, + ops::CAllReduceOpASCENDKernel) diff --git a/paddle/fluid/operators/collective/c_allreduce_prod_op_xpu.cc b/paddle/fluid/operators/collective/c_allreduce_prod_op_xpu.cc new file mode 100644 index 0000000000000000000000000000000000000000..92ba00428065bc318a48e7e9e63910716015cbf7 --- /dev/null +++ b/paddle/fluid/operators/collective/c_allreduce_prod_op_xpu.cc @@ -0,0 +1,28 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/collective/c_allreduce_op.h" + +namespace paddle { +namespace platform { +struct XPUPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_XPU_KERNEL(c_allreduce_prod, + ops::CAllReduceOpXPUKernel) diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op.cc index 68061e6ae6bea097b7a2bc5ee19d58c05fd21848..18c317506c06e1fa099f872db46990a2155e3e40 100644 --- a/paddle/fluid/operators/collective/c_allreduce_sum_op.cc +++ b/paddle/fluid/operators/collective/c_allreduce_sum_op.cc @@ -37,7 +37,12 @@ class CAllReduceSumOpGradMaker : public framework::SingleGradOpMaker { protected: void Apply(GradOpPtr retv) const override { - retv->SetType("c_allreduce_sum"); + bool use_mp = BOOST_GET_CONST(bool, this->GetAttr("use_model_parallel")); + if (use_mp) { + retv->SetType("c_identity"); + } else { + retv->SetType("c_allreduce_sum"); + } retv->SetInput("X", this->OutputGrad("Out")); retv->SetOutput("Out", this->InputGrad("X")); retv->SetAttrMap(this->Attrs()); @@ -49,6 +54,8 @@ class CAllReduceSumOpMaker : public CAllReduceOpMaker { std::string GetName() const override { return "Sum"; } }; +DECLARE_INPLACE_OP_INFERER(AllreduceSumInplaceInferer, {"X", "Out"}); + } // namespace operators } // namespace paddle @@ -58,7 +65,7 @@ namespace plat = paddle::platform; REGISTER_OPERATOR(c_allreduce_sum, ops::CAllReduceOp, ops::CAllReduceSumOpGradMaker, ops::CAllReduceSumOpGradMaker, - ops::CAllReduceSumOpMaker); + ops::CAllReduceSumOpMaker, ops::AllreduceSumInplaceInferer); REGISTER_OP_CPU_KERNEL(c_allreduce_sum, ops::CAllReduceOpCPUKernel, diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op_npu.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..b66e2e1968908cacfca80dc6b6a1939fb0392c16 --- /dev/null +++ b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu.cc @@ -0,0 +1,31 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_allreduce_op.h" + +namespace paddle { +namespace platform { +struct ASCENDPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL( + c_allreduce_sum, ops::CAllReduceOpASCENDKernel, + ops::CAllReduceOpASCENDKernel, + ops::CAllReduceOpASCENDKernel, + ops::CAllReduceOpASCENDKernel) diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..f1bf9683e35593720e9db604142312a055356bb0 --- /dev/null +++ b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc @@ -0,0 +1,189 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +#include "paddle/fluid/operators/collective/c_allreduce_op.h" +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(c_allreduce_sum); +USE_NO_KERNEL_OP(c_gen_hccl_id); +USE_NO_KERNEL_OP(c_comm_init_hccl); +USE_OP_DEVICE_KERNEL(c_allreduce_sum, NPU); + +DECLARE_string(selected_npus); + +template +void PrintDebugInfo(const std::string preStr, const std::vector& data) { + std::string debugstring = ""; + for (auto ele : data) { + debugstring += std::to_string(ele) + std::string(","); + } + VLOG(3) << preStr << ":" << std::endl << debugstring; +} + +void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + std::vector rank_ids{0, 1}; + f::AttributeMap gen_hccl_id; + + std::vector endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"}; + gen_hccl_id["rank"] = rank_id; + gen_hccl_id["endpoint"] = endpointList[rank_id]; + std::vector other_endpoints = { + endpointList[rank_id == 0 ? 
1 : 0]}; + gen_hccl_id["other_endpoints"] = other_endpoints; + + auto out = scope->Var("Out"); + auto id = out->GetMutable(); + + VLOG(3) << "break"; + + auto comm_init_op = f::OpRegistry::CreateOp("c_gen_hccl_id", {}, + {{"Out", {"Out"}}}, gen_hccl_id); + VLOG(3) << "break"; + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); + + memcpy(hccl_id, id, 1024); +} + +void Prepare(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + auto x = scope->Var("X"); + auto id = x->GetMutable(); + + memcpy(id, hccl_id, 1024); + + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + // std::vector rank_ids{0, 1}; + f::AttributeMap comm_init_attrs; + comm_init_attrs["ring_id"] = 0; + comm_init_attrs["rank_ids"] = 2; + comm_init_attrs["rank"] = rank_id; + comm_init_attrs["device_id"] = device_id; + // comm_init_attrs["rank_ids"] = rank_ids; + auto comm_init_op = f::OpRegistry::CreateOp( + "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); +} + +void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx, + int iter) { + // init + auto x = scope->Var("Data"); + auto tensor_x = x->GetMutable(); + + int rank_id = atoi(getenv("RANK_ID")); + int num1 = 3; + int num2 = 128; + + std::vector init; + for (int64_t i = 0; i < num1 * num2; ++i) { + init.push_back(1.0 + rank_id); + } + PrintDebugInfo("input data", init); + + auto place = ctx.GetPlace(); + + TensorFromVector(init, ctx, tensor_x); + tensor_x->Resize({num1, num2}); + ctx.Wait(); + + auto out = scope->Var("OutData"); + auto tensor_out = out->GetMutable(); + tensor_out->Resize({num1, num2}); + tensor_out->mutable_data(place); // allocate + ctx.Wait(); + + // run + f::AttributeMap attrs; + attrs["tag"] = std::string("tagx_" + std::to_string(iter)); + attrs["ring_id"] = 0; + + auto op = f::OpRegistry::CreateOp("c_allreduce_sum", {{"X", {"Data"}}}, + {{"Out", {"OutData"}}}, attrs); + + for (int i = 0; i < 10; i++) { + op->Run(*scope, place); + } + ctx.Wait(); + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + ctx.Wait(); + + PrintDebugInfo("output data", out_vec); + + EXPECT_EQ(out_vec.size(), init.size()); + for (uint32_t i = 0; i < out_vec.size(); i++) { + EXPECT_EQ(out_vec[i], 3.0); + } +} + +TEST(c_allreduce_sum, NPU) { + f::Scope scope; + HcclRootInfo hccl_id; + + p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); + + // only support one device, if more than one device, use first default + PrepareUniqueId(&scope, ctx, &hccl_id); + Prepare(&scope, ctx, &hccl_id); + for (int i = 0; i < 1; i++) { + VLOG(2) << "iter num: " << i; + TestHCCLAllReduceOp(&scope, ctx, i); + } +} diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op_xpu.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op_xpu.cc new file mode 100644 index 0000000000000000000000000000000000000000..e4ec538cd2323009657ea85e0a5d59db0ea0d3c8 --- /dev/null +++ b/paddle/fluid/operators/collective/c_allreduce_sum_op_xpu.cc @@ -0,0 +1,28 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
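// ---------------------------------------------------------------------------
// [Editorial sketch, not part of this patch] CAllReduceSumOpGradMaker above
// now inspects the new use_model_parallel attribute: in plain data-parallel
// training the backward of c_allreduce_sum is another c_allreduce_sum, but in
// the model-parallel pairing the forward allreduce already yields the full
// sum, so the backward only needs an identity copy (c_identity) and must not
// reduce the gradient a second time. A sketch of that decision rule, using a
// hypothetical free function:
// ---------------------------------------------------------------------------
#include <iostream>
#include <string>

std::string BackwardOpForAllReduceSum(bool use_model_parallel) {
  return use_model_parallel ? "c_identity" : "c_allreduce_sum";
}

int main() {
  std::cout << BackwardOpForAllReduceSum(false) << "\n"   // data parallel
            << BackwardOpForAllReduceSum(true) << "\n";   // model parallel
  return 0;
}
// ---------------------------------------------------------------------------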
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_allreduce_op.h" + +namespace paddle { +namespace platform { +struct XPUPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_XPU_KERNEL(c_allreduce_sum, + ops::CAllReduceOpXPUKernel) diff --git a/paddle/fluid/operators/collective/c_broadcast_op.cc b/paddle/fluid/operators/collective/c_broadcast_op.cc index 928fa8549ffb9209dea975a049db4beed0add6b6..271d543eb2364d4a088291ce6838be3f4d455ff0 100644 --- a/paddle/fluid/operators/collective/c_broadcast_op.cc +++ b/paddle/fluid/operators/collective/c_broadcast_op.cc @@ -42,6 +42,10 @@ class CBroadcastOpMaker : public framework::OpProtoAndCheckerMaker { .SetDefault(0); AddAttr("root", "(int default 0) root id for broadcasting.") .SetDefault(0); +#if defined(PADDLE_WITH_ASCEND_CL) + AddAttr("tag", "(string default tag) tag for broadcasting.") + .SetDefault("tag"); +#endif AddAttr( "use_calc_stream", "(bool default false) eject CUDA operations to calculation stream.") diff --git a/paddle/fluid/operators/collective/c_broadcast_op_npu.cc b/paddle/fluid/operators/collective/c_broadcast_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..a60ba86572822c4390f23dd85ada1db35145ffdb --- /dev/null +++ b/paddle/fluid/operators/collective/c_broadcast_op_npu.cc @@ -0,0 +1,91 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/collective/c_broadcast_op.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace paddle { +namespace operators { + +template +class CBroadcastOpASCENDKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_ASCEND_CL) + auto x = ctx.Input("X"); + void* ptr = reinterpret_cast(const_cast(x->data())); + int numel = x->numel(); + HcclDataType dtype = platform::ToHCCLDataType(x->type()); + + auto out = ctx.Output("Out"); + + int ring_id = ctx.Attr("ring_id"); + auto place = ctx.GetPlace(); + auto comm = + paddle::platform::HCCLCommContext::Instance().Get(ring_id, place); + + aclrtStream stream = nullptr; + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + if (ctx.Attr("use_calc_stream")) { + stream = static_cast(dev_ctx)->stream(); + } else { + stream = comm->stream(); + } + + int root = ctx.Attr("root"); + std::string group = + std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id); + + VLOG(3) << "begin hccl broadcast, parameter is: " + << "root " << root << ", group is " << group + << ", comm: " << comm->comm() << ", stream: " << stream; + + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclBroadcast( + ptr, numel, dtype, (uint32_t)root, comm->comm(), stream)); + + VLOG(3) << "rank " << comm->rank() << " invoke Bcast. recieved " + << framework::product(out->dims()); + + dev_ctx->Wait(); + + if (out != x) { + framework::TensorCopy(*static_cast(x), place, + *platform::DeviceContextPool::Instance().Get(place), + static_cast(out)); + } + dev_ctx->Wait(); + + out->Resize(x->dims()); + out->set_lod(x->lod()); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with NPU.")); +#endif + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(c_broadcast, ops::CBroadcastOpASCENDKernel, + ops::CBroadcastOpASCENDKernel, + ops::CBroadcastOpASCENDKernel, + ops::CBroadcastOpASCENDKernel); diff --git a/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc b/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..9e39613f3fbe3a9277402765c4f4c7a140b9be23 --- /dev/null +++ b/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc @@ -0,0 +1,181 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +#include "paddle/fluid/operators/collective/c_broadcast_op.h" +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(c_broadcast); +USE_NO_KERNEL_OP(c_gen_hccl_id); +USE_NO_KERNEL_OP(c_comm_init_hccl); +USE_OP_DEVICE_KERNEL(c_broadcast, NPU); + +DECLARE_string(selected_npus); + +template +void PrintDebugInfo(const std::string preStr, const std::vector& data) { + std::string debugstring = ""; + for (auto ele : data) { + debugstring += std::to_string(ele) + std::string(","); + } + VLOG(2) << preStr << ":" << std::endl << debugstring; +} + +void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + std::vector rank_ids{0, 1}; + f::AttributeMap gen_hccl_id; + + std::vector endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"}; + gen_hccl_id["rank"] = rank_id; + gen_hccl_id["endpoint"] = endpointList[rank_id]; + std::vector other_endpoints = { + endpointList[rank_id == 0 ? 
1 : 0]}; + gen_hccl_id["other_endpoints"] = other_endpoints; + + auto out = scope->Var("Out"); + auto id = out->GetMutable(); + + VLOG(3) << "break"; + + auto comm_init_op = f::OpRegistry::CreateOp("c_gen_hccl_id", {}, + {{"Out", {"Out"}}}, gen_hccl_id); + VLOG(3) << "break"; + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); + + memcpy(hccl_id, id, 1024); +} + +void Prepare(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + auto x = scope->Var("X"); + auto id = x->GetMutable(); + + memcpy(id, hccl_id, 1024); + + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + // std::vector rank_ids{0, 1}; + f::AttributeMap comm_init_attrs; + comm_init_attrs["ring_id"] = 0; + comm_init_attrs["rank_ids"] = 2; + comm_init_attrs["rank"] = rank_id; + comm_init_attrs["device_id"] = device_id; + // comm_init_attrs["rank_ids"] = rank_ids; + auto comm_init_op = f::OpRegistry::CreateOp( + "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); +} + +void TestHCCLBroadcastOp(f::Scope* scope, const p::DeviceContext& ctx) { + // init + auto x = scope->Var("Data"); + auto tensor_x = x->GetMutable(); + int num = 2; + std::vector init; + int rank_id = atoi(getenv("RANK_ID")); + + for (int64_t i = 0; i < num * num; ++i) { + init.push_back(1.0 + rank_id); + } + PrintDebugInfo("input data", init); + + TensorFromVector(init, ctx, tensor_x); + tensor_x->Resize({num, num}); + ctx.Wait(); + + auto place = ctx.GetPlace(); + auto out = scope->Var("OutData"); + auto tensor_out = out->GetMutable(); + tensor_out->Resize({num, num}); + tensor_out->mutable_data(place); // allocate + ctx.Wait(); + + // run + f::AttributeMap attrs; + attrs["tag"] = std::string("tagx"); + attrs["root"] = 0; + attrs["ring_id"] = 0; + + auto op = f::OpRegistry::CreateOp("c_broadcast", {{"X", {"Data"}}}, + {{"Out", {"OutData"}}}, attrs); + + for (int i = 0; i < 10; i++) { + op->Run(*scope, place); + } + ctx.Wait(); + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + ctx.Wait(); + + PrintDebugInfo("output data", out_vec); + EXPECT_EQ(out_vec.size(), init.size()); + for (uint32_t i = 0; i < out_vec.size(); i++) { + EXPECT_EQ(out_vec[i], 1.0); + } +} + +TEST(c_broadcast, NPU) { + f::Scope scope; + HcclRootInfo hccl_id; + // only support one device, if more than one device, use first default + p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); + + PrepareUniqueId(&scope, ctx, &hccl_id); + Prepare(&scope, ctx, &hccl_id); + TestHCCLBroadcastOp(&scope, ctx); +} diff --git a/paddle/fluid/operators/collective/c_comm_init_hccl_op.cc b/paddle/fluid/operators/collective/c_comm_init_hccl_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..7817f19bacb1879517d4865165836f46e4b68e75 --- /dev/null +++ b/paddle/fluid/operators/collective/c_comm_init_hccl_op.cc @@ -0,0 +1,96 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
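// ---------------------------------------------------------------------------
// [Editorial sketch, not part of this patch] c_broadcast replicates the root
// rank's tensor to every rank in the ring: the ASCEND kernel above calls
// HcclBroadcast in place and then copies X into Out when they are distinct
// tensors. In TestHCCLBroadcastOp above, rank r fills a 2x2 tensor with
// 1.0 + r and root is 0, so both ranks must read back all 1.0, which is what
// the EXPECT_EQ loop asserts. Host-side model (SimulateBroadcast is a
// hypothetical helper, not a Paddle API):
// ---------------------------------------------------------------------------
#include <vector>

// Every rank's output equals the root rank's input.
std::vector<float> SimulateBroadcast(
    const std::vector<std::vector<float>>& inputs, int root) {
  return inputs.at(root);
}
// ---------------------------------------------------------------------------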
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <string>
+
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace framework {
+class Scope;
+}  // namespace framework
+}  // namespace paddle
+#if defined(PADDLE_WITH_ASCEND_CL)
+#include "paddle/fluid/platform/collective_helper.h"
+#endif
+
+namespace paddle {
+namespace operators {
+
+class CCommInitOpAscend : public framework::OperatorBase {
+ public:
+  CCommInitOpAscend(const std::string& type,
+                    const framework::VariableNameMap& inputs,
+                    const framework::VariableNameMap& outputs,
+                    const framework::AttributeMap& attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& place) const override {
+    PADDLE_ENFORCE_EQ(is_npu_place(place), true,
+                      platform::errors::PreconditionNotMet(
+                          "CCommInitOpAscend can run on npu place only."));
+
+    auto var = scope.FindVar(Input("X"));
+    PADDLE_ENFORCE_NOT_NULL(
+        var, platform::errors::InvalidArgument("Input can not be empty."));
+#if defined(PADDLE_WITH_ASCEND_CL)
+    HcclRootInfo* hccl_id = var->GetMutable<HcclRootInfo>();
+
+    int rank_ids = Attr<int>("rank_ids");
+    int rank_id = Attr<int>("rank");
+    int rid = Attr<int>("ring_id");
+    int device_id = BOOST_GET_CONST(platform::NPUPlace, place).device;
+    if (Attr<int>("device_id") >= 0) {
+      device_id = Attr<int>("device_id");
+    }
+    platform::HCCLCommContext::Instance().CreateHCCLComm(
+        hccl_id, rank_ids, rank_id, device_id, rid);
+#else
+    PADDLE_THROW(platform::errors::PreconditionNotMet(
+        "PaddlePaddle should compile with NPU."));
+#endif
+  }
+};
+
+class CCommInitOpAscendMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "Raw variable contains an HCCL UniqueId instance.");
+    AddComment(R"DOC(
+CCommInit operator
+
+Initialize collective communication context within this trainer
+)DOC");
+    AddAttr<int>("rank_ids",
+                 "(int) The number of ranks of distributed trainers");
+    AddAttr<int>("rank",
+                 "(int) The rank of the trainer in distributed training.");
+    AddAttr<int>("device_id",
+                 "(int) The device_id on which to initialize the communicator."
+                 "Now, you only have to set this attr manually for pipeline "
+                 "training. Otherwise, leave it as the default.")
+        .SetDefault(-1);
+    AddAttr<int>("ring_id", "(int default 0) user specified ring id")
+        .SetDefault(0);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(c_comm_init_hccl, ops::CCommInitOpAscend,
+                  ops::CCommInitOpAscendMaker);
diff --git a/paddle/fluid/operators/collective/c_concat_op.cc b/paddle/fluid/operators/collective/c_concat_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..551fde21162582fbbb2b356a2aa265247a4af94d
--- /dev/null
+++ b/paddle/fluid/operators/collective/c_concat_op.cc
@@ -0,0 +1,112 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
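// ---------------------------------------------------------------------------
// [Editorial sketch, not part of this patch] The c_concat op added below is
// used as the counterpart of c_split in model-parallel layers: each rank holds
// a shard whose last dimension is 1/nranks of the full width, and every rank
// gets the full tensor back, concatenated along the last dimension (its
// InferShape multiplies the last dim by nranks, and its grad maker registers
// c_split). The CUDA kernel realizes this as an ncclAllGather along dim 0
// followed by a re-arrangement onto the last axis. Host-side sketch of the
// resulting layout (SimulateCConcat is a hypothetical helper, not a Paddle
// API):
// ---------------------------------------------------------------------------
#include <cstddef>
#include <vector>

// shards[r] is a rows x cols_per_rank matrix in row-major order; the result is
// rows x (cols_per_rank * nranks) with rank r's columns in block r.
std::vector<float> SimulateCConcat(
    const std::vector<std::vector<float>>& shards, size_t rows,
    size_t cols_per_rank) {
  const size_t nranks = shards.size();
  std::vector<float> out(rows * cols_per_rank * nranks);
  for (size_t r = 0; r < nranks; ++r) {
    for (size_t i = 0; i < rows; ++i) {
      for (size_t j = 0; j < cols_per_rank; ++j) {
        out[i * cols_per_rank * nranks + r * cols_per_rank + j] =
            shards[r][i * cols_per_rank + j];
      }
    }
  }
  return out;
}
// ---------------------------------------------------------------------------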
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_concat_op.h" + +namespace paddle { +namespace operators { + +class CConcatOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "c_concat"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "c_concat"); + int nranks = ctx->Attrs().Get("nranks"); + int rank = ctx->Attrs().Get("rank"); + int ring_id = ctx->Attrs().Get("ring_id"); + PADDLE_ENFORCE_GE(nranks, 2, platform::errors::InvalidArgument( + "The number of ranks (%d) for c_concat " + "must be greater than 1.", + nranks)); + PADDLE_ENFORCE_GE( + ring_id, 0, + platform::errors::InvalidArgument( + "The ring_id (%d) for c_concat must be non-negative.", ring_id)); + PADDLE_ENFORCE_GE( + rank, 0, platform::errors::InvalidArgument( + "The rank (%d) for c_concat must be non-negative.", rank)); + PADDLE_ENFORCE_LT(rank, nranks, + platform::errors::InvalidArgument( + "The value of rank (%d) for c_concat must " + "be less than that of nranks.", + rank, nranks)); + + framework::DDim dim = ctx->GetInputDim("X"); + dim[dim.size() - 1] = dim[dim.size() - 1] * nranks; + if (dim[dim.size() - 1] < 0) dim[dim.size() - 1] = -1; + ctx->SetOutputDim("Out", dim); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); + } +}; + +template +class CConcatOpGradMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr retv) const override { + retv->SetType("c_split"); + retv->SetInput("X", this->OutputGrad("Out")); + retv->SetOutput("Out", this->InputGrad("X")); + retv->SetAttrMap(this->Attrs()); + } +}; + +class CConcatOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("X", "(Tensor) tensor to be concated."); + AddOutput("Out", "(Tensor) the result of concat."); + AddAttr("rank", "(int default 0) rank id.").SetDefault(0); + AddAttr("nranks", "(int default 1) number of ranks.").SetDefault(1); + AddAttr("ring_id", "(int default 0) ring id.").SetDefault(0); + AddAttr( + "use_calc_stream", + "(bool default true) eject CUDA operations to calculation stream.") + .SetDefault(true); + AddAttr("use_model_parallel", + "(bool default true) use this op with model parallel.") + .SetDefault(true); + AddComment(R"DOC( +CConcat Operator +AllGather the tensors on different trainers and concat them along the last dimension. 
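+For example, assuming two trainers (nranks = 2), a per-rank input of shape [4, 8] produces the same [4, 16] output on every trainer, i.e. the gathered inputs joined on the final dimension.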
+)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OPERATOR(c_concat, ops::CConcatOp, + ops::CConcatOpGradMaker, + ops::CConcatOpGradMaker, + ops::CConcatOpMaker); + +REGISTER_OP_CPU_KERNEL(c_concat, ops::CConcatOpCPUKernel, + ops::CConcatOpCPUKernel, + ops::CConcatOpCPUKernel, + ops::CConcatOpCPUKernel, + ops::CConcatOpCPUKernel); diff --git a/paddle/fluid/operators/collective/c_concat_op.cu.cc b/paddle/fluid/operators/collective/c_concat_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..bfdc49c440aae76a2aa9cebae82a419b471f4662 --- /dev/null +++ b/paddle/fluid/operators/collective/c_concat_op.cu.cc @@ -0,0 +1,110 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#include "paddle/fluid/operators/collective/c_concat_op.h" +#include "paddle/fluid/operators/math/concat_and_split.h" + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/nccl_helper.h" +#endif + +namespace paddle { +namespace operators { + +template +class CConcatOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto x = ctx.Input("X"); + auto out = ctx.Output("Out"); + ncclDataType_t dtype = platform::ToNCCLDataType(x->type()); + + int nranks = ctx.Attr("nranks"); + int rank = ctx.Attr("rank"); + int rid = ctx.Attr("ring_id"); + auto place = ctx.GetPlace(); + PADDLE_ENFORCE_GE(rank, 0, + platform::errors::PreconditionNotMet( + "The value of rank (%d) for c_concat must be " + "greater than or equal to 0.", + rank)); + PADDLE_ENFORCE_GE(nranks, 2, + platform::errors::PreconditionNotMet( + "The value of nranks (%d) for c_concat must be " + "greater than or equal to 2.", + nranks)); + PADDLE_ENFORCE_LT(rank, nranks, + platform::errors::PreconditionNotMet( + "The value of rank (%d) for c_concat must be " + "less than that of nranks (%d).", + rank, nranks)); + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + auto comm = platform::NCCLCommContext::Instance().Get(rid, place); + PADDLE_ENFORCE_EQ( + nranks, comm->nranks(), + platform::errors::InvalidArgument("nranks: %s should equal to %s", + nranks, comm->nranks())); + + framework::Tensor temp_out; + framework::DDim temp_out_dims = x->dims(); + temp_out_dims[0] *= nranks; + temp_out.mutable_data(temp_out_dims, place); + int64_t send_numel = x->numel(); + const T* send_buff = x->data(); + T* recv_buff = temp_out.data(); + gpuStream_t stream = nullptr; + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + stream = static_cast(dev_ctx)->stream(); + + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllGather( + send_buff, recv_buff, send_numel, static_cast(dtype), + comm->comm(), stream)); + + std::vector inputs; + int axis = x->dims().size() - 1; + auto out_dims = x->dims(); + 
out_dims[out_dims.size() - 1] *= nranks; + int rows_per_tensor = x->dims()[0]; + int offset = 0; + for (int i = 0; i < nranks; i++) { + framework::Tensor temp = temp_out.Slice(offset, offset + rows_per_tensor); + inputs.emplace_back(temp); + offset += rows_per_tensor; + } + + math::ConcatFunctor functor; + out->mutable_data(out_dims, place); + auto& dev_ctx2 = ctx.template device_context(); + functor(dev_ctx2, inputs, axis, out); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with GPU.")); +#endif + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL(c_concat, ops::CConcatOpCUDAKernel, + ops::CConcatOpCUDAKernel, + ops::CConcatOpCUDAKernel, + ops::CConcatOpCUDAKernel, + ops::CConcatOpCUDAKernel); diff --git a/paddle/fluid/operators/collective/c_concat_op.h b/paddle/fluid/operators/collective/c_concat_op.h new file mode 100644 index 0000000000000000000000000000000000000000..55a5799e37b6f5728793d0a03cb7e7a01558f2cb --- /dev/null +++ b/paddle/fluid/operators/collective/c_concat_op.h @@ -0,0 +1,38 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class CConcatOpCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_THROW(platform::errors::Unavailable( + "Do not support c_concat for cpu kernel now.")); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/collective/c_gen_hccl_id_op.cc b/paddle/fluid/operators/collective/c_gen_hccl_id_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..593eaf923a978402cc7607bb7d2bc4a6419dd2cb --- /dev/null +++ b/paddle/fluid/operators/collective/c_gen_hccl_id_op.cc @@ -0,0 +1,111 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+#include
+
+#include "glog/logging.h"
+#include "paddle/fluid/framework/op_proto_maker.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/var_type_traits.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/place.h"
+
+#ifdef PADDLE_WITH_ASCEND_CL
+#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
+#endif
+
+namespace paddle {
+namespace operators {
+
+#ifdef PADDLE_WITH_ASCEND_CL
+
+class CGenHCCLIdOp : public framework::OperatorBase {
+ public:
+  CGenHCCLIdOp(const std::string& type,
+               const framework::VariableNameMap& inputs,
+               const framework::VariableNameMap& outputs,
+               const framework::AttributeMap& attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& dev_place) const override {
+    int rank = Attr("rank");
+    framework::Scope& local_scope = scope.NewScope();
+
+    std::function func = [&](size_t i) -> std::string {
+      return Output("Out");
+    };
+
+    if (rank == 0) {
+      std::vector endpoint_list =
+          Attr>("other_endpoints");
+      SendBroadCastHCCLID(endpoint_list, 1, func, local_scope);
+    } else {
+      std::string endpoint = Attr("endpoint");
+      RecvBroadCastHCCLID(endpoint, 1, func, local_scope);
+    }
+    scope.DeleteScope(&local_scope);
+  }
+};
+
+#else
+
+class CGenHCCLIdOp : public framework::OperatorBase {
+ public:
+  CGenHCCLIdOp(const std::string& type,
+               const framework::VariableNameMap& inputs,
+               const framework::VariableNameMap& outputs,
+               const framework::AttributeMap& attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& dev_place) const override {}
+};
+
+#endif
+
+class CGenHCCLIdOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    VLOG(3) << "ele";
+    AddOutput("Out", "Raw variable contains an HCCL UniqueId instance.");
+    AddComment(R"DOC(
+CGenHCCLId operator
+
+For trainer 0: generate a new UniqueId and send it to all the other trainers.
+For trainers 1~n: start a gRPC server to receive the UniqueId, and stop the server once it is received.
+)DOC");
+    AddAttr("endpoint",
+            "(string), e.g. 127.0.0.1:6175 "
+            "current listen endpoint");
+    AddAttr>(
+        "other_endpoints",
+        "['trainer1_ip:port', 'trainer2_ip:port', ...] "
+        "list of other trainer endpoints")
+        .SetDefault({});
+    AddAttr("rank",
+            "(int default 0) "
+            "The rank of the trainer in distributed training.")
+        .SetDefault(0);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(c_gen_hccl_id, ops::CGenHCCLIdOp, ops::CGenHCCLIdOpMaker);
diff --git a/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc b/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc
index 485a6d7ec4ed3575b6a35b74869d611a6678e2c5..470537582e97838322de2dabdd880b254d6401c9 100644
--- a/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc
+++ b/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc
@@ -27,6 +27,7 @@ limitations under the License.
*/ namespace paddle { namespace operators { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) static void GenNCCLID(std::vector* nccl_ids) { for (size_t i = 0; i < nccl_ids->size(); ++i) { PADDLE_ENFORCE_CUDA_SUCCESS( @@ -65,6 +66,9 @@ class CGenNCCLIdOp : public framework::OperatorBase { return Output("Out"); }; + std::string endpoint = Attr("endpoint"); + int server_fd = platform::SocketServer::GetInstance(endpoint).socket(); + std::vector nccl_ids; nccl_ids.resize(1); @@ -74,8 +78,7 @@ class CGenNCCLIdOp : public framework::OperatorBase { Attr>("other_endpoints"); platform::SendBroadCastCommID(endpoint_list, &nccl_ids); } else { - std::string endpoint = Attr("endpoint"); - platform::RecvBroadCastCommID(endpoint, &nccl_ids); + platform::RecvBroadCastCommID(server_fd, endpoint, &nccl_ids); } CopyNCCLIDToVar(nccl_ids, func, scope); @@ -83,6 +86,21 @@ class CGenNCCLIdOp : public framework::OperatorBase { } }; +#else +class CGenNCCLIdOp : public framework::OperatorBase { + public: + CGenNCCLIdOp(const std::string& type, + const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void RunImpl(const framework::Scope& scope, + const platform::Place& dev_place) const override {} +}; + +#endif + class CGenNCCLIdOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { diff --git a/paddle/fluid/operators/collective/c_identity_op.cc b/paddle/fluid/operators/collective/c_identity_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..646c27b90e17ea316d31ae2199d8433b855efdd4 --- /dev/null +++ b/paddle/fluid/operators/collective/c_identity_op.cc @@ -0,0 +1,92 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/collective/c_identity_op.h" + +namespace paddle { +namespace operators { + +class CIdentityOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "c_identity"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "c_identity"); + int ring_id = ctx->Attrs().Get("ring_id"); + PADDLE_ENFORCE_GE( + ring_id, 0, + platform::errors::InvalidArgument( + "The ring_id (%d) for c_identity must be non-negative.", ring_id)); + framework::DDim dim = ctx->GetInputDim("X"); + ctx->SetOutputDim("Out", dim); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); + } +}; + +class CIdentityOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("X", "(Tensor) identity tensor."); + AddOutput("Out", "(Tensor) identity tensor."); + AddAttr("ring_id", "(int default 0) nccl communication ring id.") + .SetDefault(0); + AddAttr( + "use_calc_stream", + "(bool default true) eject CUDA operations to calculation stream.") + .SetDefault(true); + AddAttr("use_model_parallel", + "(bool default true) use this op with model parallel.") + .SetDefault(true); + AddComment(R"DOC( +Identity Operator which returns a copy of itself. +)DOC"); + } +}; + +template +class CIdentityOpGradMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr retv) const override { + retv->SetType("c_allreduce_sum"); + retv->SetInput("X", this->OutputGrad("Out")); + retv->SetOutput("Out", this->InputGrad("X")); + retv->SetAttrMap(this->Attrs()); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OPERATOR(c_identity, ops::CIdentityOp, + ops::CIdentityOpGradMaker, + ops::CIdentityOpGradMaker, + ops::CIdentityOpMaker); + +REGISTER_OP_CPU_KERNEL(c_identity, ops::CIdentityOpCPUKernel, + ops::CIdentityOpCPUKernel, + ops::CIdentityOpCPUKernel, + ops::CIdentityOpCPUKernel, + ops::CIdentityOpCPUKernel); diff --git a/paddle/fluid/operators/collective/c_identity_op.cu.cc b/paddle/fluid/operators/collective/c_identity_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..05bb3830b601fbb6cb9be38de258b56776fafad4 --- /dev/null +++ b/paddle/fluid/operators/collective/c_identity_op.cu.cc @@ -0,0 +1,24 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/collective/c_identity_op.h" + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL(c_identity, ops::CIdentityOpKernel, + ops::CIdentityOpKernel, + ops::CIdentityOpKernel, + ops::CIdentityOpKernel, + ops::CIdentityOpKernel); diff --git a/paddle/fluid/operators/collective/c_identity_op.h b/paddle/fluid/operators/collective/c_identity_op.h new file mode 100644 index 0000000000000000000000000000000000000000..c8577a9617489887167dbc7d9ae008608f1be48e --- /dev/null +++ b/paddle/fluid/operators/collective/c_identity_op.h @@ -0,0 +1,56 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class CIdentityOpCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_THROW(platform::errors::Unavailable( + "Do not support c_identity for cpu kernel now.")); + } +}; + +template +class CIdentityOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto x = ctx.Input("X"); + auto out = ctx.Output("Out"); + + int rid = ctx.Attr("ring_id"); + PADDLE_ENFORCE_GE( + rid, 0, + platform::errors::InvalidArgument( + "The ring_id (%d) for c_identity op must be non-negative.", rid)); + out->mutable_data(ctx.GetPlace()); + + TensorCopy(*x, out->place(), out); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/collective/c_identity_op_npu.cc b/paddle/fluid/operators/collective/c_identity_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..a822bd11a4a8332111d6c0813a377fa214a0c390 --- /dev/null +++ b/paddle/fluid/operators/collective/c_identity_op_npu.cc @@ -0,0 +1,21 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/collective/c_identity_op.h" + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(c_identity, ops::CIdentityOpKernel, + ops::CIdentityOpKernel, + ops::CIdentityOpKernel, + ops::CIdentityOpKernel, + ops::CIdentityOpKernel); diff --git a/paddle/fluid/operators/collective/c_reduce_max_op_npu.cc b/paddle/fluid/operators/collective/c_reduce_max_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..f35b4c2f707226f38c37c350dcb6a76c160ff50f --- /dev/null +++ b/paddle/fluid/operators/collective/c_reduce_max_op_npu.cc @@ -0,0 +1,31 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_reduce_op.h" + +namespace paddle { +namespace platform { +struct ASCENDPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(c_reduce_max, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel) diff --git a/paddle/fluid/operators/distributed_ops/split_byref_op.cu.cc b/paddle/fluid/operators/collective/c_reduce_max_op_xpu.cc similarity index 54% rename from paddle/fluid/operators/distributed_ops/split_byref_op.cu.cc rename to paddle/fluid/operators/collective/c_reduce_max_op_xpu.cc index 056659c3ea61f6233a6dda56ca1e272e72770d4a..6d3af7bb5f258b425a8618e412c3bb5552113bfe 100644 --- a/paddle/fluid/operators/distributed_ops/split_byref_op.cu.cc +++ b/paddle/fluid/operators/collective/c_reduce_max_op_xpu.cc @@ -1,10 +1,10 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at -http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, @@ -12,8 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/distributed_ops/split_byref_op.h" +#include "paddle/fluid/operators/collective/c_reduce_op.h" + +namespace paddle { +namespace platform { +struct XPUPlace; +struct float16; +} // namespace platform +} // namespace paddle + namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - split_byref, - ops::SplitByrefOpKernel); +namespace plat = paddle::platform; + +REGISTER_OP_XPU_KERNEL(c_reduce_max, + ops::CReduceOpXPUKernel) diff --git a/paddle/fluid/operators/collective/c_reduce_min_op_npu.cc b/paddle/fluid/operators/collective/c_reduce_min_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..6ebb7e4c40e68e961059fdd41566492a5274bc31 --- /dev/null +++ b/paddle/fluid/operators/collective/c_reduce_min_op_npu.cc @@ -0,0 +1,31 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_reduce_op.h" + +namespace paddle { +namespace platform { +struct ASCENDPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(c_reduce_min, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel) diff --git a/paddle/fluid/operators/collective/c_reduce_min_op_xpu.cc b/paddle/fluid/operators/collective/c_reduce_min_op_xpu.cc new file mode 100644 index 0000000000000000000000000000000000000000..791e58d8493cec454e2fe74c772ec70999cc8f36 --- /dev/null +++ b/paddle/fluid/operators/collective/c_reduce_min_op_xpu.cc @@ -0,0 +1,28 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_reduce_op.h" + +namespace paddle { +namespace platform { +struct XPUPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_XPU_KERNEL(c_reduce_min, + ops::CReduceOpXPUKernel) diff --git a/paddle/fluid/operators/collective/c_reduce_op.h b/paddle/fluid/operators/collective/c_reduce_op.h index 1bce01e13a2ad25638128f4f619f458348d97b5e..fa9fd079d8e48b053deaa12fb3d61b0c00713b22 100644 --- a/paddle/fluid/operators/collective/c_reduce_op.h +++ b/paddle/fluid/operators/collective/c_reduce_op.h @@ -24,15 +24,28 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_ASCEND_CL) #include "paddle/fluid/platform/collective_helper.h" +#endif + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/nccl_helper.h" #endif + +#if defined(PADDLE_WITH_XPU_BKCL) +#include "paddle/fluid/platform/bkcl_helper.h" +#endif + #if defined(PADDLE_WITH_GLOO) #include #include "paddle/fluid/framework/fleet/gloo_wrapper.h" #endif +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/hccl_helper.h" +#endif + namespace paddle { namespace operators { @@ -110,6 +123,148 @@ class CReduceOpCPUKernel : public framework::OpKernel { } }; +template +class CReduceOpASCENDKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_ASCEND_CL) + auto in = ctx.Input("X"); + auto out = ctx.Output("Out"); + auto place = ctx.GetPlace(); + HcclDataType dtype = platform::ToHCCLDataType(in->type()); + int64_t numel = in->numel(); + + void* sendbuff = reinterpret_cast(const_cast(in->data())); + void* recvbuff = reinterpret_cast(out->data()); + + int ring_id = ctx.Attr("ring_id"); + int root_id = ctx.Attr("root_id"); + std::string group = + std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id); + auto comm = + paddle::platform::HCCLCommContext::Instance().Get(ring_id, place); + + aclrtStream stream = nullptr; + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + if (ctx.Attr("use_calc_stream")) { + stream = static_cast(dev_ctx)->stream(); + } else { + stream = comm->stream(); + } + + int rank_id = comm->rank(); + + HcclReduceOp hccl_red_type = HCCL_REDUCE_SUM; + switch (red_type) { + case kRedSum: + hccl_red_type = HCCL_REDUCE_SUM; + break; + + case kRedMax: + hccl_red_type = HCCL_REDUCE_MAX; + break; + + case kRedMin: + hccl_red_type = HCCL_REDUCE_MIN; + break; + + case kRedProd: + hccl_red_type = HCCL_REDUCE_PROD; + break; + + default: + PADDLE_THROW(platform::errors::InvalidArgument( + "Invalid reduce type: %d", red_type)); + } + + VLOG(3) << "begin hccl reduce, parameter is: " + << "input num: " << numel << "root_id: " << root_id + << "dtype: " << dtype << "hccl_red_type: " << hccl_red_type + << ", group is: " << group; + + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclAllReduce( + sendbuff, recvbuff, numel, dtype, hccl_red_type, comm->comm(), + reinterpret_cast(stream))); + + if (rank_id != root_id) { + auto npu_place = BOOST_GET_CONST(platform::NPUPlace, place); + memory::Copy(npu_place, reinterpret_cast(out->data()), + npu_place, + reinterpret_cast(const_cast(in->data())), + numel * sizeof(T), stream); + } + + out->Resize(in->dims()); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with NPU.")); +#endif + } +}; + +template +class CReduceOpXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_XPU_BKCL) + auto in = ctx.Input("X"); + auto out = ctx.Output("Out"); + + auto place = ctx.GetPlace(); + BKCLDataType dtype = platform::ToBKCLDataType(in->type()); + int64_t numel = in->numel(); + const void* sendbuff = in->data(); + out->Resize(in->dims()); + void* recvbuff = out->mutable_data(place); + + int rid = ctx.Attr("ring_id"); + 
int root = ctx.Attr("root_id"); + auto comm = platform::BKCLCommContext::Instance().Get(rid, place); + + XPUStream stream = nullptr; + if (ctx.Attr("use_calc_stream")) { + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + stream = static_cast(dev_ctx) + ->x_context() + ->xpu_stream; + } else { + stream = comm->stream(); + } + + BKCLOp bkcl_red_type = BKCL_ADD; + switch (red_type) { + case kRedSum: + bkcl_red_type = BKCL_ADD; + break; + + case kRedMax: + bkcl_red_type = BKCL_MAX; + break; + + case kRedMin: + bkcl_red_type = BKCL_MIN; + break; + + case kRedProd: + bkcl_red_type = BKCL_PRODUCT; + break; + + default: + PADDLE_THROW(platform::errors::InvalidArgument( + "Invalid reduce type: %d", red_type)); + } + + PADDLE_ENFORCE_EQ(bkcl_reduce(comm->comm(), sendbuff, recvbuff, numel, + dtype, bkcl_red_type, root, stream), + BKCL_SUCCESS, platform::errors::PreconditionNotMet( + "BKCL all reduce failed")); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should be compiled with XPU.")); +#endif + } +}; + template class CReduceOpCUDAKernel : public framework::OpKernel { public: @@ -179,6 +334,10 @@ class CReduceOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("Out", "(Tensor) the reduced result."); AddAttr("ring_id", "(int default 0) communication ring id.") .SetDefault(0); +#if defined(PADDLE_WITH_ASCEND_CL) + AddAttr("tag", "(string default tag) tag for reduce.") + .SetDefault("tag"); +#endif AddAttr("root_id", "(int default 0) root id.").SetDefault(0); AddAttr( "use_calc_stream", diff --git a/paddle/fluid/operators/collective/c_reduce_prod_op_npu.cc b/paddle/fluid/operators/collective/c_reduce_prod_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..f0b7021e7997d9298bf350b387cd3521a4eb035d --- /dev/null +++ b/paddle/fluid/operators/collective/c_reduce_prod_op_npu.cc @@ -0,0 +1,31 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_reduce_op.h" + +namespace paddle { +namespace platform { +struct ASCENDPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(c_reduce_prod, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel) diff --git a/paddle/fluid/operators/collective/c_reduce_prod_op_xpu.cc b/paddle/fluid/operators/collective/c_reduce_prod_op_xpu.cc new file mode 100644 index 0000000000000000000000000000000000000000..e7e770e8ffdcaf36408a13c7e01a31ebd1eced20 --- /dev/null +++ b/paddle/fluid/operators/collective/c_reduce_prod_op_xpu.cc @@ -0,0 +1,28 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_reduce_op.h" + +namespace paddle { +namespace platform { +struct XPUPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_XPU_KERNEL(c_reduce_prod, + ops::CReduceOpXPUKernel) diff --git a/paddle/fluid/operators/collective/c_reduce_sum_op_npu.cc b/paddle/fluid/operators/collective/c_reduce_sum_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..dd4dbbd5f364575258ca23991bda68482cdca0f3 --- /dev/null +++ b/paddle/fluid/operators/collective/c_reduce_sum_op_npu.cc @@ -0,0 +1,31 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_reduce_op.h" + +namespace paddle { +namespace platform { +struct ASCENDPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(c_reduce_sum, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel) diff --git a/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc b/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..3683c7722ba3bfbfbd12dbf7b1a1688fa7446708 --- /dev/null +++ b/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc @@ -0,0 +1,192 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +#include "paddle/fluid/operators/collective/c_reduce_op.h" +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(c_reduce_sum); +USE_NO_KERNEL_OP(c_gen_hccl_id); +USE_NO_KERNEL_OP(c_comm_init_hccl); +USE_OP_DEVICE_KERNEL(c_reduce_sum, NPU); + +DECLARE_string(selected_npus); + +template +void PrintDebugInfo(const std::string preStr, const std::vector& data) { + std::string debugstring = ""; + for (auto ele : data) { + debugstring += std::to_string(ele) + std::string(","); + } + VLOG(3) << preStr << ":" << std::endl << debugstring; +} + +void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + std::vector rank_ids{0, 1}; + f::AttributeMap gen_hccl_id; + + std::vector endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"}; + gen_hccl_id["rank"] = rank_id; + gen_hccl_id["endpoint"] = endpointList[rank_id]; + std::vector other_endpoints = { + endpointList[rank_id == 0 ? 
1 : 0]}; + gen_hccl_id["other_endpoints"] = other_endpoints; + + auto out = scope->Var("Out"); + auto id = out->GetMutable(); + + VLOG(3) << "break"; + + auto comm_init_op = f::OpRegistry::CreateOp("c_gen_hccl_id", {}, + {{"Out", {"Out"}}}, gen_hccl_id); + VLOG(3) << "break"; + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); + + memcpy(hccl_id, id, 1024); +} + +void Prepare(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + auto x = scope->Var("X"); + auto id = x->GetMutable(); + + memcpy(id, hccl_id, 1024); + + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + // std::vector rank_ids{0, 1}; + f::AttributeMap comm_init_attrs; + comm_init_attrs["ring_id"] = 0; + comm_init_attrs["rank_ids"] = 2; + comm_init_attrs["rank"] = rank_id; + comm_init_attrs["device_id"] = device_id; + // comm_init_attrs["rank_ids"] = rank_ids; + auto comm_init_op = f::OpRegistry::CreateOp( + "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); +} + +void TestHCCLReduceOp(f::Scope* scope, const p::DeviceContext& ctx, int iter) { + // init + auto x = scope->Var("Data"); + auto tensor_x = x->GetMutable(); + + int rank_id = atoi(getenv("RANK_ID")); + int num1 = 3; + int num2 = 128; + + std::vector init; + for (int64_t i = 0; i < num1 * num2; ++i) { + init.push_back(1.0 + rank_id); + } + PrintDebugInfo("input data", init); + + auto place = ctx.GetPlace(); + + TensorFromVector(init, ctx, tensor_x); + tensor_x->Resize({num1, num2}); + ctx.Wait(); + + auto out = scope->Var("OutData"); + auto tensor_out = out->GetMutable(); + tensor_out->Resize({num1, num2}); + tensor_out->mutable_data(place); // allocate + ctx.Wait(); + + // run + f::AttributeMap attrs; + attrs["tag"] = std::string("tagx_" + std::to_string(iter)); + attrs["ring_id"] = 0; + int root_id = 0; + attrs["root_id"] = root_id; + + auto op = f::OpRegistry::CreateOp("c_reduce_sum", {{"X", {"Data"}}}, + {{"Out", {"OutData"}}}, attrs); + + op->Run(*scope, place); + ctx.Wait(); + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + ctx.Wait(); + + PrintDebugInfo("output data", out_vec); + + EXPECT_EQ(out_vec.size(), init.size()); + for (uint32_t i = 0; i < out_vec.size(); i++) { + if (rank_id == root_id) { + EXPECT_EQ(out_vec[i], 3.0); + } else { + EXPECT_EQ(out_vec[i], init[i]); + } + } +} + +TEST(c_reduce_sum, NPU) { + f::Scope scope; + HcclRootInfo hccl_id; + + // only support one device, if more than one device, use first default + p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); + + PrepareUniqueId(&scope, ctx, &hccl_id); + Prepare(&scope, ctx, &hccl_id); + for (int i = 0; i < 2; i++) { + VLOG(2) << "iter num: " << i; + TestHCCLReduceOp(&scope, ctx, i); + } +} diff --git a/paddle/fluid/operators/collective/c_reduce_sum_op_xpu.cc b/paddle/fluid/operators/collective/c_reduce_sum_op_xpu.cc new file mode 100644 index 0000000000000000000000000000000000000000..a0ec4d2a99cd711f315186f7ce8966585685aeef --- /dev/null +++ b/paddle/fluid/operators/collective/c_reduce_sum_op_xpu.cc @@ -0,0 +1,28 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_reduce_op.h" + +namespace paddle { +namespace platform { +struct XPUPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_XPU_KERNEL(c_reduce_sum, + ops::CReduceOpXPUKernel) diff --git a/paddle/fluid/operators/collective/c_reducescatter_op.cc b/paddle/fluid/operators/collective/c_reducescatter_op.cc index ada1fd2b1270ccbd530a5a420248bb12d2707ffd..7836f11dc9b1fb9e6bb426e07450098ca3667535 100644 --- a/paddle/fluid/operators/collective/c_reducescatter_op.cc +++ b/paddle/fluid/operators/collective/c_reducescatter_op.cc @@ -49,6 +49,10 @@ class CReduceScatterOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("nranks", "Total trainer count of the distributed training job") .SetDefault(1); +#if defined(PADDLE_WITH_ASCEND_CL) + AddAttr("tag", "(string default tag) tag for reduce scatter.") + .SetDefault("tag"); +#endif AddAttr( "use_calc_stream", "(bool default false) eject CUDA operations to calculation stream.") diff --git a/paddle/fluid/operators/collective/c_reducescatter_op.h b/paddle/fluid/operators/collective/c_reducescatter_op.h index 366d8a3747cfb7a88c697100ed4a49af00ee06e6..490b152bc2d302c29701a7cadfe91f2dc5bd8c45 100644 --- a/paddle/fluid/operators/collective/c_reducescatter_op.h +++ b/paddle/fluid/operators/collective/c_reducescatter_op.h @@ -22,6 +22,7 @@ limitations under the License. */ #include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/collective/c_allreduce_op.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/collective/c_reducescatter_op_npu.cc b/paddle/fluid/operators/collective/c_reducescatter_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..44096a82c34d61bc75fc494145d7d52e63cd0aa3 --- /dev/null +++ b/paddle/fluid/operators/collective/c_reducescatter_op_npu.cc @@ -0,0 +1,87 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/collective/c_reducescatter_op.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace paddle { +namespace operators { + +template +class CReduceScatterOpAscendKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_ASCEND_CL) + auto in = ctx.Input("X"); + auto out = ctx.Output("Out"); + + int ring_id = ctx.Attr("ring_id"); + std::string group = + std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id); + auto place = ctx.GetPlace(); + auto comm = platform::HCCLCommContext::Instance().Get(ring_id, place); + int nranks = comm->nranks(); + + auto out_dims = in->dims(); + PADDLE_ENFORCE_EQ(out_dims[0] % nranks, 0, + platform::errors::InvalidArgument( + "The input tensor X's " + "dim[0] (%d) should be divisible by nranks(%d)", + out_dims[0], nranks)); + + out_dims[0] = out_dims[0] / nranks; + out->mutable_data(out_dims, place); + + uint64_t recv_numel = in->numel() / nranks; + + void* inputPtr = reinterpret_cast(const_cast(in->data())); + void* outputPtr = reinterpret_cast(out->data()); + HcclDataType dtype = platform::ToHCCLDataType(in->type()); + + aclrtStream stream = nullptr; + if (ctx.Attr("use_calc_stream")) { + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + stream = static_cast(dev_ctx)->stream(); + } else { + stream = comm->stream(); + } + VLOG(3) << "begin hccl reduce scatter, parameter is: " + << "recv_numel: " << recv_numel << "dtype: " << dtype + << "hccl_red_type: " << HCCL_REDUCE_SUM << ", group is: " << group; + + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclReduceScatter( + inputPtr, outputPtr, recv_numel, dtype, HCCL_REDUCE_SUM, comm->comm(), + reinterpret_cast(stream))); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with NPU.")); +#endif + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(c_reducescatter, + ops::CReduceScatterOpAscendKernel, + ops::CReduceScatterOpAscendKernel, + ops::CReduceScatterOpAscendKernel, + ops::CReduceScatterOpAscendKernel); diff --git a/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc b/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..f82f050a7206fe73f110df8f78989ad6181de84d --- /dev/null +++ b/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc @@ -0,0 +1,189 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +#include "paddle/fluid/operators/collective/c_allgather_op.h" +#include "paddle/fluid/operators/collective/c_allreduce_op.h" +#include "paddle/fluid/operators/collective/c_broadcast_op.h" +#include "paddle/fluid/operators/collective/c_reducescatter_op.h" +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(c_reducescatter); +USE_NO_KERNEL_OP(c_gen_hccl_id); +USE_NO_KERNEL_OP(c_comm_init_hccl); +USE_OP_DEVICE_KERNEL(c_reducescatter, NPU); + +DECLARE_string(selected_npus); + +template +void PrintDebugInfo(const std::string preStr, const std::vector& data) { + std::string debugstring = ""; + for (auto ele : data) { + debugstring += std::to_string(ele) + std::string(","); + } + VLOG(2) << preStr << ":" << std::endl << debugstring; +} + +void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + std::vector rank_ids{0, 1}; + f::AttributeMap gen_hccl_id; + + std::vector endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"}; + gen_hccl_id["rank"] = rank_id; + gen_hccl_id["endpoint"] = endpointList[rank_id]; + std::vector other_endpoints = { + endpointList[rank_id == 0 ? 
1 : 0]}; + gen_hccl_id["other_endpoints"] = other_endpoints; + + auto out = scope->Var("Out"); + auto id = out->GetMutable(); + + VLOG(3) << "break"; + + auto comm_init_op = f::OpRegistry::CreateOp("c_gen_hccl_id", {}, + {{"Out", {"Out"}}}, gen_hccl_id); + VLOG(3) << "break"; + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); + + memcpy(hccl_id, id, 1024); +} + +void Prepare(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + auto x = scope->Var("X"); + auto id = x->GetMutable(); + + memcpy(id, hccl_id, 1024); + + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + // std::vector rank_ids{0, 1}; + f::AttributeMap comm_init_attrs; + comm_init_attrs["ring_id"] = 0; + comm_init_attrs["rank_ids"] = 2; + comm_init_attrs["rank"] = rank_id; + comm_init_attrs["device_id"] = device_id; + // comm_init_attrs["rank_ids"] = rank_ids; + auto comm_init_op = f::OpRegistry::CreateOp( + "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); +} + +void TestHCCLReduceScatterOp(f::Scope* scope, const p::DeviceContext& ctx) { + // init + auto x = scope->Var("Data"); + auto tensor_x = x->GetMutable(); + + std::vector init; + int num1 = 4; + int num2 = 1; + + for (int64_t i = 0; i < num1 * num2; ++i) { + init.push_back(1.0); + } + PrintDebugInfo("input data", init); + + TensorFromVector(init, ctx, tensor_x); + tensor_x->Resize({num1, num2}); + + ctx.Wait(); + + auto place = ctx.GetPlace(); + auto out = scope->Var("OutData"); + auto tensor_out = out->GetMutable(); + tensor_out->Resize({num1, num2}); + tensor_out->mutable_data(place); // allocate + + ctx.Wait(); + + // run + f::AttributeMap attrs; + attrs["tag"] = std::string("tagx"); + attrs["ring_id"] = 0; + attrs["nranks"] = 2; + + auto op = f::OpRegistry::CreateOp("c_reducescatter", {{"X", {"Data"}}}, + {{"Out", {"OutData"}}}, attrs); + + int iter_num = 10; + for (int i = 0; i < iter_num; i++) { + op->Run(*scope, place); + ctx.Wait(); + } + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + ctx.Wait(); + + PrintDebugInfo("output data", out_vec); + EXPECT_EQ(out_vec.size(), init.size() / 2); + for (uint32_t i = 0; i < out_vec.size(); i++) { + EXPECT_EQ(out_vec[i], 2.0); + } +} + +TEST(c_reducescatter, NPU) { + f::Scope scope; + HcclRootInfo hccl_id; + + // only support one device, if more than one device, use first default + p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); + + PrepareUniqueId(&scope, ctx, &hccl_id); + Prepare(&scope, ctx, &hccl_id); + TestHCCLReduceScatterOp(&scope, ctx); +} diff --git a/paddle/fluid/operators/collective/c_split_op.cc b/paddle/fluid/operators/collective/c_split_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..03046d571d0f0542ff714868205d5a0aa285e685 --- /dev/null +++ b/paddle/fluid/operators/collective/c_split_op.cc @@ -0,0 +1,112 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/collective/c_split_op.h"
+
+namespace paddle {
+namespace operators {
+
+class CSplitOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "c_split");
+    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "c_split");
+    int nranks = ctx->Attrs().Get("nranks");
+    int rank = ctx->Attrs().Get("rank");
+    int ring_id = ctx->Attrs().Get("ring_id");
+    PADDLE_ENFORCE_GE(nranks, 2, platform::errors::InvalidArgument(
+                                     "The number of ranks (%d) for c_split "
+                                     "must be greater than 1.",
+                                     nranks));
+    PADDLE_ENFORCE_GE(
+        ring_id, 0,
+        platform::errors::InvalidArgument(
+            "The ring_id (%d) for c_split must be non-negative.", ring_id));
+    PADDLE_ENFORCE_GE(
+        rank, 0, platform::errors::InvalidArgument(
+                     "The rank (%d) for c_split must be non-negative.", rank));
+    PADDLE_ENFORCE_LT(rank, nranks,
+                      platform::errors::InvalidArgument(
+                          "The value of rank (%d) for c_split must "
+                          "be less than that of nranks (%d).",
+                          rank, nranks));
+
+    framework::DDim dim = ctx->GetInputDim("X");
+    dim[dim.size() - 1] = dim[dim.size() - 1] / nranks;
+    if (dim[0] < 0) dim[0] = -1;
+    ctx->SetOutputDim("Out", dim);
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace());
+  }
+};
+
+template
+class CSplitOpGradMaker : public framework::SingleGradOpMaker {
+ public:
+  using framework::SingleGradOpMaker::SingleGradOpMaker;
+
+ protected:
+  void Apply(GradOpPtr retv) const override {
+    retv->SetType("c_allgather");
+    retv->SetInput("X", this->OutputGrad("Out"));
+    retv->SetOutput("Out", this->InputGrad("X"));
+    retv->SetAttrMap(this->Attrs());
+  }
+};
+
+class CSplitOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() {
+    AddInput("X", "(Tensor) tensor to be split.");
+    AddOutput("Out", "(Tensor) the result of split.");
+    AddAttr("rank", "(int default 0) rank id.").SetDefault(0);
+    AddAttr("nranks", "(int default 1) number of ranks.").SetDefault(1);
+    AddAttr("ring_id", "(int default 0) ring id.").SetDefault(0);
+    AddAttr(
+        "use_calc_stream",
+        "(bool default false) eject CUDA operations to calculation stream.")
+        .SetDefault(false);
+    AddAttr("use_model_parallel",
+            "(bool default true) use this op with model parallel.")
+        .SetDefault(true);
+    AddComment(R"DOC(
+CSplit Operator
+Split the input tensor evenly along its last dimension; the trainer with rank i keeps the i-th slice.
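+For example, assuming two trainers (nranks = 2), a [4, 16] input is divided into two [4, 8] slices along the last dimension, and the trainer with rank 0 keeps the first slice while rank 1 keeps the second.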
+)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OPERATOR(c_split, ops::CSplitOp, + ops::CSplitOpGradMaker, + ops::CSplitOpGradMaker, + ops::CSplitOpMaker); + +REGISTER_OP_CPU_KERNEL(c_split, ops::CSplitOpCPUKernel, + ops::CSplitOpCPUKernel, + ops::CSplitOpCPUKernel, + ops::CSplitOpCPUKernel, + ops::CSplitOpCPUKernel); diff --git a/paddle/fluid/operators/collective/c_split_op.cu.cc b/paddle/fluid/operators/collective/c_split_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..92a7f5e41b1d2d8a1e3f4582ad014f630010c8ca --- /dev/null +++ b/paddle/fluid/operators/collective/c_split_op.cu.cc @@ -0,0 +1,80 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#include "paddle/fluid/operators/collective/c_split_op.h" +#include "paddle/fluid/operators/math/concat_and_split.h" + +namespace paddle { +namespace operators { + +template +class CSplitOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto x = ctx.Input("X"); + auto out = ctx.Output("Out"); + + int nranks = ctx.Attr("nranks"); + int rank = ctx.Attr("rank"); + auto place = ctx.GetPlace(); + + PADDLE_ENFORCE_GE(rank, 0, platform::errors::PreconditionNotMet( + "The value of rank (%d) for c_split must be " + "greater than or equal to 0.", + rank)); + PADDLE_ENFORCE_GE(nranks, 2, + platform::errors::PreconditionNotMet( + "The value of nranks (%d) for c_split must be " + "greater than or equal to 2.", + nranks)); + PADDLE_ENFORCE_LT(rank, nranks, + platform::errors::PreconditionNotMet( + "The value of rank (%d) for c_split must be " + "less than that of nranks (%d).", + rank, nranks)); + + auto& dev_ctx = ctx.template device_context(); + std::vector shape_refer; + std::vector results; + size_t numel = x->numel(); + auto dims = x->dims(); + numel /= nranks; + int axis = dims.size() - 1; + dims[dims.size() - 1] /= nranks; + for (int i = 0; i < nranks; i++) { + framework::Tensor* out = new framework::Tensor(); + out->mutable_data(dims, place); + shape_refer.emplace_back(out); + results.emplace_back(out); + } + + math::SplitFunctor functor; + functor(dev_ctx, *x, shape_refer, axis, &results); + out->mutable_data(dims, place); + paddle::framework::TensorCopySync(*results[rank], out->place(), out); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL(c_split, ops::CSplitOpCUDAKernel, + ops::CSplitOpCUDAKernel, + ops::CSplitOpCUDAKernel, + ops::CSplitOpCUDAKernel, + ops::CSplitOpCUDAKernel); diff --git a/paddle/fluid/operators/collective/c_split_op.h b/paddle/fluid/operators/collective/c_split_op.h new file mode 100644 index 0000000000000000000000000000000000000000..ea0c7fc45c66b8bfeb51d88999f57c1e94eb2ab8 --- /dev/null +++ b/paddle/fluid/operators/collective/c_split_op.h @@ -0,0 +1,38 @@ 
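// Editor's note: a small reference sketch (assumed, not part of the patch) of
// the shape rule that CSplitOp::InferShape and the CUDA kernel above apply:
// the last dimension is divided evenly by nranks and rank r keeps the r-th
// slice along that axis; the backward pass is mapped onto c_allgather, which
// concatenates the slices back. The helper name below is illustrative only.
#include <cstdint>
#include <vector>

std::vector<int64_t> CSplitOutShape(std::vector<int64_t> dims, int nranks) {
  // mirrors InferShape: dim[last] /= nranks (a dynamic leading dim stays -1)
  dims.back() /= nranks;
  if (dims.front() < 0) dims.front() = -1;
  return dims;
}
// e.g. an input of shape [8, 30] with nranks = 2 yields an output of shape
// [8, 15]; rank 0 keeps columns [0, 15) and rank 1 keeps columns [15, 30).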
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class CSplitOpCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_THROW(platform::errors::Unavailable( + "Do not support c_split for cpu kernel now.")); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc b/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc index c4abe284d720963deecaabdb30f96f9b55e4753b..71ab25a7b0ff8a490d7de0022f810009a58482d4 100644 --- a/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc +++ b/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc @@ -15,40 +15,20 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" -namespace paddle { -namespace framework { -class Scope; -} // namespace framework -} // namespace paddle - namespace paddle { namespace operators { -class CSyncCalcStreamOp : public framework::OperatorBase { +class CSyncCalcStreamOp : public framework::OperatorWithKernel { public: - CSyncCalcStreamOp(const std::string& type, - const framework::VariableNameMap& inputs, - const framework::VariableNameMap& outputs, - const framework::AttributeMap& attrs) - : OperatorBase(type, inputs, outputs, attrs) {} - - void RunImpl(const framework::Scope& scope, - const platform::Place& place) const override { - PADDLE_ENFORCE_EQ(is_gpu_place(place), true, - platform::errors::PreconditionNotMet( - "Sync stream op can run on gpu place only for now.")); -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32) - auto dev_ctx = static_cast( - platform::DeviceContextPool::Instance().Get(place)); -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(dev_ctx->stream())); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(dev_ctx->stream())); -#endif -#else - PADDLE_THROW(platform::errors::PreconditionNotMet( - "PaddlePaddle should compile with GPU.")); -#endif + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override {} + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(framework::proto::VarType::FP32, + ctx.GetPlace()); } }; @@ -65,10 +45,47 @@ Call calculation stream synchronization. 
} }; +template +class CSyncCalcStreamKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32) + + auto place = ctx.GetPlace(); + auto dev_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(place)); + +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(dev_ctx->stream())); +#else + PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(dev_ctx->stream())); +#endif + +#elif defined(PADDLE_WITH_ASCEND_CL) && !defined(_WIN32) + auto place = ctx.GetPlace(); + PADDLE_ENFORCE_EQ(is_npu_place(place), true, + platform::errors::PreconditionNotMet( + "Sync stream op can run on npu place only for now.")); + + auto dev_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(place)); + PADDLE_ENFORCE_NPU_SUCCESS(aclrtSynchronizeStream(dev_ctx->stream())); + +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with GPU.")); +#endif + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; -REGISTER_OPERATOR(c_sync_calc_stream, ops::CSyncCalcStreamOp, - ops::CSyncCalcStreamOpMaker); +REGISTER_OP_WITHOUT_GRADIENT(c_sync_calc_stream, ops::CSyncCalcStreamOp, + ops::CSyncCalcStreamOpMaker); + +REGISTER_OP_CUDA_KERNEL(c_sync_calc_stream, ops::CSyncCalcStreamKernel); + +REGISTER_OP_NPU_KERNEL(c_sync_calc_stream, ops::CSyncCalcStreamKernel); diff --git a/paddle/fluid/operators/collective/c_sync_calc_stream_op_npu_test.cc b/paddle/fluid/operators/collective/c_sync_calc_stream_op_npu_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..45613715b8260c3f38968e5cd91f245cd9f524d5 --- /dev/null +++ b/paddle/fluid/operators/collective/c_sync_calc_stream_op_npu_test.cc @@ -0,0 +1,107 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
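// Editor's note (design observation, no new behaviour): rewriting the op from
// OperatorBase to OperatorWithKernel lets each device register its own kernel
// via REGISTER_OP_CUDA_KERNEL / REGISTER_OP_NPU_KERNEL instead of branching on
// the place inside RunImpl. Because the op moves no data, InferShape is left
// empty and GetExpectedKernelType pins an arbitrary FP32 kernel type; the
// kernel itself is only a host-side barrier on the device's computation stream
// (cudaStreamSynchronize, hipStreamSynchronize or aclrtSynchronizeStream).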
*/ + +#ifndef _WIN32 +#include +#endif + +#include + +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(elementwise_add); +USE_OP_DEVICE_KERNEL(elementwise_add, NPU); +USE_OP_DEVICE_KERNEL(c_sync_calc_stream, NPU); + +template +void Compare(f::Scope* scope, const p::DeviceContext& ctx) { + // init + auto x = scope->Var("X"); + auto tensor_x = x->GetMutable(); + + auto y = scope->Var("Y"); + auto tensor_y = y->GetMutable(); + + std::vector init_x; + for (int64_t i = 0; i < 10 * 10; ++i) { + init_x.push_back(static_cast(1.0)); + } + + std::vector init_y; + for (int64_t i = 0; i < 10 * 10; ++i) { + init_y.push_back(static_cast(2.0)); + } + + TensorFromVector(init_x, ctx, tensor_x); + tensor_x->Resize({10, 10}); + TensorFromVector(init_y, ctx, tensor_y); + tensor_y->Resize({10, 10}); + + f::AttributeMap attrs; + auto place = ctx.GetPlace(); + auto out = scope->Var("Out"); + auto tensor_out = out->GetMutable(); + + // sync data + auto sync_op0 = f::OpRegistry::CreateOp("c_sync_calc_stream", {{"X", {"X"}}}, + {{"Out", {"Out"}}}, attrs); + sync_op0->Run(*scope, place); + + // run + + auto op = + f::OpRegistry::CreateOp("elementwise_add", {{"X", {"X"}}, {"Y", {"Y"}}}, + {{"Out", {"Out"}}}, attrs); + + op->Run(*scope, place); + + // sync op run + auto sync_op = f::OpRegistry::CreateOp("c_sync_calc_stream", {{"X", {"X"}}}, + {{"Out", {"Out"}}}, attrs); + sync_op->Run(*scope, place); + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + + // sync op copy + auto sync_op2 = f::OpRegistry::CreateOp("c_sync_calc_stream", {{"X", {"X"}}}, + {{"Out", {"Out"}}}, attrs); + sync_op2->Run(*scope, place); + + float expected = 3.0; + + EXPECT_EQ(out_vec.size(), init_x.size()); + for (uint32_t i = 0; i < out_vec.size(); i++) { + EXPECT_EQ(out_vec[i], static_cast(expected)); + } +} + +TEST(c_sync_calc_stream, NPU_fp32) { + f::Scope scope; + p::NPUDeviceContext ctx(p::NPUPlace(0)); + Compare(&scope, ctx); +} diff --git a/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc b/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc index adf27069f524e45c52ba30d9ee3e6920c7ea7751..71fda2cd01c8d6007cab19ebeea365467e8e7a99 100644 --- a/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc +++ b/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc @@ -14,45 +14,30 @@ limitations under the License. 
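// Editor's note: the c_sync_calc_stream NPU test above fills X with 1.0 and Y
// with 2.0, queues elementwise_add, and relies on a c_sync_calc_stream run
// (rather than ctx.Wait()) as the barrier before TensorToVector copies Out
// back to the host, so every element is expected to equal 3.0 once the
// computation stream has drained.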
*/ #include #include "paddle/fluid/framework/op_registry.h" -namespace paddle { -namespace framework { -class Scope; -} // namespace framework -} // namespace paddle + #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#include "paddle/fluid/platform/nccl_helper.h" +#endif + +#if defined(PADDLE_WITH_ASCEND_CL) #include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" #endif namespace paddle { namespace operators { -class CSyncCommStreamOp : public framework::OperatorBase { +class CSyncCommStreamOp : public framework::OperatorWithKernel { public: - CSyncCommStreamOp(const std::string& type, - const framework::VariableNameMap& inputs, - const framework::VariableNameMap& outputs, - const framework::AttributeMap& attrs) - : OperatorBase(type, inputs, outputs, attrs) {} - - void RunImpl(const framework::Scope& scope, - const platform::Place& place) const override { - PADDLE_ENFORCE_EQ(is_gpu_place(place), true, - platform::errors::PreconditionNotMet( - "Sync stream op can run on gpu place only for now.")); + using framework::OperatorWithKernel::OperatorWithKernel; -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - int ring_id = Attr("ring_id"); - auto stream = - platform::NCCLCommContext::Instance().Get(ring_id, place)->stream(); -#ifdef PADDLE_WITH_RCCL - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); -#endif -#else - PADDLE_THROW(platform::errors::PreconditionNotMet( - "PaddlePaddle should compile with GPU.")); -#endif + void InferShape(framework::InferShapeContext* ctx) const override {} + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(framework::proto::VarType::FP32, + ctx.GetPlace()); } }; @@ -72,10 +57,46 @@ Call communication stream synchronization. 
} }; +template +class CSyncCommStreamKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto place = ctx.GetPlace(); +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + int ring_id = ctx.Attr("ring_id"); + auto stream = + platform::NCCLCommContext::Instance().Get(ring_id, place)->stream(); + +#ifdef PADDLE_WITH_RCCL + PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream)); +#else + PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); +#endif + +#elif defined(PADDLE_WITH_ASCEND_CL) + PADDLE_ENFORCE_EQ(is_npu_place(place), true, + platform::errors::PreconditionNotMet( + "Sync stream op can run on npu place only for now.")); + int ring_id = ctx.Attr("ring_id"); + auto stream = + platform::HCCLCommContext::Instance().Get(ring_id, place)->stream(); + PADDLE_ENFORCE_NPU_SUCCESS(aclrtSynchronizeStream(stream)); + +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with GPU.")); +#endif + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; -REGISTER_OPERATOR(c_sync_comm_stream, ops::CSyncCommStreamOp, - ops::CSyncCommStreamOpMaker); +REGISTER_OP_WITHOUT_GRADIENT(c_sync_comm_stream, ops::CSyncCommStreamOp, + ops::CSyncCommStreamOpMaker); + +REGISTER_OP_CUDA_KERNEL(c_sync_comm_stream, ops::CSyncCommStreamKernel); + +REGISTER_OP_NPU_KERNEL(c_sync_comm_stream, ops::CSyncCommStreamKernel); diff --git a/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc b/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..6c5a6db61483dcd7e3578ded6a12a8a421ca1933 --- /dev/null +++ b/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc @@ -0,0 +1,190 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
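// Editor's note: unlike c_sync_calc_stream, which waits on the device's
// computation stream, this kernel waits on the *communication* stream owned by
// the NCCL/RCCL/HCCL communicator selected by ring_id, conceptually:
//
//   auto stream =
//       platform::NCCLCommContext::Instance().Get(ring_id, place)->stream();
//   cudaStreamSynchronize(stream);  // or hipStreamSynchronize /
//                                   // aclrtSynchronizeStream on ROCm / NPU
//
// so the host can be sure a previously launched collective on that ring has
// finished before its output is consumed.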
*/ + +#ifndef _WIN32 +#include +#endif + +#include + +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +#include "paddle/fluid/operators/collective/c_broadcast_op.h" +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(c_broadcast); +USE_OP_DEVICE_KERNEL(c_sync_comm_stream, NPU); +USE_NO_KERNEL_OP(c_gen_hccl_id); +USE_NO_KERNEL_OP(c_comm_init_hccl); +USE_OP_DEVICE_KERNEL(c_broadcast, NPU); + +DECLARE_string(selected_npus); + +template +void PrintDebugInfo(const std::string preStr, const std::vector& data) { + std::string debugstring = ""; + for (auto ele : data) { + debugstring += std::to_string(ele) + std::string(","); + } + VLOG(2) << preStr << ":" << std::endl << debugstring; +} + +void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + std::vector rank_ids{0, 1}; + f::AttributeMap gen_hccl_id; + + std::vector endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"}; + gen_hccl_id["rank"] = rank_id; + gen_hccl_id["endpoint"] = endpointList[rank_id]; + std::vector other_endpoints = { + endpointList[rank_id == 0 ? 
1 : 0]}; + gen_hccl_id["other_endpoints"] = other_endpoints; + + auto out = scope->Var("Out"); + auto id = out->GetMutable(); + + VLOG(3) << "break"; + + auto comm_init_op = f::OpRegistry::CreateOp("c_gen_hccl_id", {}, + {{"Out", {"Out"}}}, gen_hccl_id); + VLOG(3) << "break"; + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); + + memcpy(hccl_id, id, 1024); +} + +void Prepare(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + auto x = scope->Var("X"); + auto id = x->GetMutable(); + + memcpy(id, hccl_id, 1024); + + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + // std::vector rank_ids{0, 1}; + f::AttributeMap comm_init_attrs; + comm_init_attrs["ring_id"] = 0; + comm_init_attrs["rank_ids"] = 2; + comm_init_attrs["rank"] = rank_id; + comm_init_attrs["device_id"] = device_id; + // comm_init_attrs["rank_ids"] = rank_ids; + auto comm_init_op = f::OpRegistry::CreateOp( + "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); +} + +void TestHCCLBroadcastOp(f::Scope* scope, const p::DeviceContext& ctx) { + std::cout << "BEGIN TEST:" << __FUNCTION__ << std::endl; + // init + auto x = scope->Var("Data"); + auto tensor_x = x->GetMutable(); + int num = 2; + std::vector init; + int rank_id = atoi(getenv("RANK_ID")); + std::cout << "rank_id:" << rank_id << std::endl; + for (int64_t i = 0; i < num * num; ++i) { + init.push_back(1.0 + rank_id); + std::cout << init[0]; + } + std::cout << std::endl; + + TensorFromVector(init, ctx, tensor_x); + tensor_x->Resize({num, num}); + + ctx.Wait(); + + auto place = ctx.GetPlace(); + auto out = scope->Var("OutData"); + auto tensor_out = out->GetMutable(); + tensor_out->Resize({num, num}); + tensor_out->mutable_data(place); // allocate + + ctx.Wait(); + + // run + f::AttributeMap attrs; + attrs["tag"] = std::string("tagx"); + attrs["root"] = 0; + attrs["ring_id"] = 0; + + auto op = f::OpRegistry::CreateOp("c_broadcast", {{"X", {"Data"}}}, + {{"Out", {"OutData"}}}, attrs); + + op->Run(*scope, place); + + // comm sync + + auto sync_op = f::OpRegistry::CreateOp( + "c_sync_comm_stream", {{"X", {"Data"}}}, {{"Out", {"OutData"}}}, attrs); + sync_op->Run(*scope, place); + + // ctx.Wait(); + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + + EXPECT_EQ(out_vec.size(), init.size()); + for (uint32_t i = 0; i < out_vec.size(); i++) { + EXPECT_EQ(out_vec[i], 1.0); + } +} + +TEST(c_sync_comm_stream_op, NPU) { + f::Scope scope; + HcclRootInfo hccl_id; + + // only support one device, if more than one device, use first default + p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); + + PrepareUniqueId(&scope, ctx, &hccl_id); + Prepare(&scope, ctx, &hccl_id); + TestHCCLBroadcastOp(&scope, ctx); +} diff --git a/paddle/fluid/operators/collective/gen_hccl_id_op.cc b/paddle/fluid/operators/collective/gen_hccl_id_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..0cb2dd188725f8f582a20318fbe4845b39047d3e --- /dev/null +++ b/paddle/fluid/operators/collective/gen_hccl_id_op.cc @@ -0,0 +1,216 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
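// Editor's note: the NPU tests in this patch all share the two-step
// communicator setup used above (a summary, not new behaviour):
//   1. PrepareUniqueId runs "c_gen_hccl_id", which produces an HcclRootInfo in
//      the scope variable "Out" (rank 0 creates it and shares it with the peer
//      endpoints listed in the op attributes), then copies it out.
//   2. Prepare copies that root info into "X" and runs "c_comm_init_hccl" with
//      ring_id / rank / rank_ids / device_id so the HCCL communicator for
//      ring 0 exists before any collective op runs.
// Both steps read RANK_ID and DEVICE_ID from the environment, so the test is
// meant to be launched once per participating device.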
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include + +#include "glog/logging.h" +#include "paddle/fluid/framework/op_proto_maker.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/var_type_traits.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/hccl_helper.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/string/split.h" + +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" + +namespace paddle { +namespace operators { + +#ifdef PADDLE_WITH_ASCEND_CL + +class GenHCCLIdOp : public framework::OperatorBase { + public: + GenHCCLIdOp(const std::string& type, const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void RunImpl(const framework::Scope& scope, + const platform::Place& dev_place) const override { + std::vector trainers = + Attr>("trainers"); + int trainer_id = Attr("trainer_id"); + std::string endpoint = trainers[trainer_id]; + + PADDLE_ENFORCE_GE(trainer_id, 0, platform::errors::InvalidArgument( + "trainer_id %d is less than 0. Its " + "valid range is [0, trainer_size)")); + PADDLE_ENFORCE_LT( + trainer_id, static_cast(trainers.size()), + platform::errors::OutOfRange("trainer_id %d is out of range. Its valid " + "range is [0, trainer_size)", + trainer_id)); + + int hccl_comm_num = Attr("hccl_comm_num"); + int use_hierarchical_allreduce = Attr("use_hierarchical_allreduce"); + int inter_nranks = Attr("hierarchical_allreduce_inter_nranks"); + int inter_trainer_id = -1; + int exter_trainer_id = -1; + + if (use_hierarchical_allreduce) { + PADDLE_ENFORCE_GT( + trainers.size(), 1, + platform::errors::PreconditionNotMet( + "The number of collective trainers %llu <= 1", trainers.size())); + PADDLE_ENFORCE_GT( + inter_nranks, 1, + platform::errors::PreconditionNotMet( + "inter_nranks %d <= 1 while in hierarchical allreduce mode", + inter_nranks)); + PADDLE_ENFORCE_EQ( + trainers.size() % inter_nranks, 0, + platform::errors::PreconditionNotMet( + "The number of trainers %llu mod inter_nranks %d is not equal 0", + trainers.size(), inter_nranks)); + + inter_trainer_id = trainer_id % inter_nranks; + + if (trainer_id % inter_nranks == 0) { + exter_trainer_id = trainer_id / inter_nranks; + } + } + + std::ostringstream ss; + for (size_t i = 0; i < trainers.size(); i++) { + ss << trainers[i] << ","; + } + + VLOG(1) << "trainer_id:" << trainer_id + << ", use_hierarchical_allreduce:" << use_hierarchical_allreduce + << ", hccl_comm_num:" << hccl_comm_num + << ", inter_nranks:" << inter_nranks + << ", inter_trainer_id:" << inter_trainer_id + << ", exter_trainer_id:" << exter_trainer_id + << ", trainers:" << ss.str(); + + int server_fd = -1; + + /// 1. 
init flat + std::function func = platform::GetFlatHCCLVarName; + if (trainer_id == 0) { + // server endpoints + std::vector flat_endpoints; + flat_endpoints.insert(flat_endpoints.begin(), trainers.begin() + 1, + trainers.end()); + SendBroadCastHCCLID(flat_endpoints, hccl_comm_num, func, scope); + } else { + server_fd = CreateListenSocket(endpoint); + RecvBroadCastHCCLID(server_fd, endpoint, hccl_comm_num, func, scope); + } + + /// 2. hierarchical inter ncclid + func = platform::GetHierarchicalInterHCCLVarName; + if (inter_trainer_id == 0) { + std::ostringstream ss; + ss << endpoint; + std::vector inter_endpoints; + for (int i = trainer_id + 1; i < trainer_id + inter_nranks && + i < static_cast(trainers.size()); + i++) { + ss << ","; + inter_endpoints.push_back(trainers[i]); + ss << trainers[i]; + } + VLOG(1) << "Hierarchical inter ring endpoints:" << ss.str(); + + SendBroadCastHCCLID(inter_endpoints, hccl_comm_num, func, scope); + } else if (inter_trainer_id > 0) { + VLOG(1) << "Hierarchical inter ring"; + RecvBroadCastHCCLID(server_fd, endpoint, hccl_comm_num, func, scope); + } + + /// 3. hierarchical exter ncclid + func = platform::GetHierarchicalExterHCCLVarName; + if (exter_trainer_id == 0) { + std::ostringstream ss; + std::vector exter_endpoints; + ss << endpoint; + for (size_t i = inter_nranks; i < trainers.size(); i += inter_nranks) { + ss << ","; + exter_endpoints.push_back(trainers[i]); + ss << trainers[i]; + } + VLOG(1) << "Hierarchical exter ring endpoints:" << ss.str(); + + SendBroadCastHCCLID(exter_endpoints, hccl_comm_num, func, scope); + } else if (exter_trainer_id > 0) { + VLOG(1) << "Hierarchical exter ring"; + RecvBroadCastHCCLID(server_fd, endpoint, hccl_comm_num, func, scope); + } + + // close socket server + if (trainer_id != 0) { + CloseSocket(server_fd); + } + } +}; + +#else +class GenHCCLIdOp : public framework::OperatorBase { + public: + GenHCCLIdOp(const std::string& type, const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void RunImpl(const framework::Scope& scope, + const platform::Place& dev_place) const override {} +}; + +#endif + +class GenHCCLIdOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddOutput("HCCLID", "Raw variable contains a HCCL UniqueId instaces."); + AddComment(R"DOC( +GenHCCLId operator + +For trainer 0: generate a new UniqueId and send it to all the other trainers. +For trainer 1~n: start a gRPC server to get the UniqueId, once got, stop the server. +)DOC"); + AddAttr>( + "trainers", + "['trainer0_ip:port', 'trainer1_ip:port', ...] 
" + "list of all trainer endpoints") + .SetDefault({}); + AddAttr("trainer_id", + "(int) " + "The index of the trainer in distributed training."); + AddAttr("hccl_comm_num", + "(int default 1) " + "The number of nccl communicator num.") + .SetDefault(1); + AddAttr("use_hierarchical_allreduce", + "(bool default false) " + "Wheter to use hierarchical allreduce.") + .SetDefault(false); + AddAttr("hierarchical_allreduce_inter_nranks", + "(int default 1) " + "Wheter to use hierarchical allreduce.") + .SetDefault(-1); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(gen_hccl_id, ops::GenHCCLIdOp, ops::GenHCCLIdOpMaker); diff --git a/paddle/fluid/operators/collective/gen_hccl_id_op_helper.cc b/paddle/fluid/operators/collective/gen_hccl_id_op_helper.cc new file mode 100644 index 0000000000000000000000000000000000000000..15940a76f71105a57865c0c8e00b404d087e9485 --- /dev/null +++ b/paddle/fluid/operators/collective/gen_hccl_id_op_helper.cc @@ -0,0 +1,350 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "glog/logging.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/var_type_traits.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/string/split.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace paddle { +namespace operators { + +constexpr char COMM_HEAD[] = "_pd_gen_comm_id_"; +#define HCCL_UNIQUE_ID_BYTES 1024 + +// Check system calls, such as socket, bind. 
+#define CHECK_SYS_CALL(call, name) \ + do { \ + int retval; \ + CHECK_SYS_CALL_VAL(call, name, retval); \ + } while (false) + +#define CHECK_SYS_CALL_VAL(call, name, retval) \ + do { \ + RETRY_SYS_CALL_VAL(call, name, retval); \ + if (retval == -1) { \ + PADDLE_THROW(platform::errors::Unavailable("Call to %s failed: %s", \ + name, strerror(errno))); \ + } \ + } while (false) + +#define RETRY_SYS_CALL_VAL(call, name, retval) \ + do { \ + retval = (call); \ + if (retval == -1 && \ + (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) { \ + LOG(WARNING) << "Call " << name << " returned " << strerror(errno) \ + << " retry"; \ + } else { \ + break; \ + } \ + } while (true) + +static int SocketSend(int fd, const char* buffer, int size) { + int offset = 0; + int bytes = 0; + while (offset < size) { + bytes = send(fd, buffer + offset, size - offset, 0); + if (bytes == -1) { + if (errno != EINTR && errno != EWOULDBLOCK && errno != EAGAIN) { + // send failed + return -1; + } else { + bytes = 0; + } + } + offset += bytes; + } + return offset; +} + +static int SocketRecv(int fd, char* buffer, int size) { + int offset = 0; + int bytes = 0; + while (offset < size) { + bytes = recv(fd, buffer + offset, size - offset, 0); + if (bytes == 0) { + // closed by client, maybe probing alive client + return 0; + } + if (bytes == -1) { + if (errno != EINTR && errno != EWOULDBLOCK && errno != EAGAIN) { + return -1; + } else { + bytes = 0; + } + } + offset += bytes; + } + return offset; +} + +static void BindOrConnectFailed(int timeout, int* try_times, int* total_time, + const char* op, const std::string& ep) { + PADDLE_ENFORCE_LT( + *total_time, timeout, + platform::errors::Unavailable("%s addr=%s timeout, failed reason: %s", op, + ep.c_str(), strerror(errno))); + ++(*try_times); + int retry_time = std::min(*try_times * 500, 3000); // max 3 seconds + *total_time += retry_time; + + LOG(WARNING) << op << " addr=" << ep << " failed " << *try_times + << " times with reason: " << strerror(errno) << " retry after " + << retry_time / 1000.0 << " seconds"; + std::this_thread::sleep_for(std::chrono::milliseconds(retry_time)); +} + +int CreateListenSocket(const std::string& ep) { + auto addr = paddle::string::Split(ep, ':'); + PADDLE_ENFORCE_EQ( + addr.size(), 2UL, + platform::errors::InvalidArgument( + "The endpoint should contain host and port, but got %s.", ep)); + std::string host = addr[0]; + int port = std::stoi(addr[1]); + + // creating socket fd + int server_fd = -1; + CHECK_SYS_CALL_VAL(socket(AF_INET, SOCK_STREAM, 0), "socket", server_fd); + + // NOTE. Solutions to `Address already in use`. + // 1. Reuse addr&port. Otherwise, once the server closes the socket + // before client, the server will enter TIME-WAIT status. If we bind port + // again, the error `Address already in use` will appear. + // 2. Or we can close the client first to ensure that the server does + // not enter the TIME-WAIT state. But this is obviously not as convenient + // as the reuse method. 
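  // Editor's note: concretely, without SO_REUSEADDR a restarted trainer would
  // see bind() fail with EADDRINUSE until the previous socket leaves the
  // TIME-WAIT state (typically around 60 seconds); SO_REUSEPORT, available
  // since Linux 3.9 as noted below, additionally allows several sockets to
  // bind the same address/port pair.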
+ int opt = 1; +#if defined(SO_REUSEPORT) + // since Linux kernel 3.9 + CHECK_SYS_CALL(setsockopt(server_fd, SOL_SOCKET, SO_REUSEADDR | SO_REUSEPORT, + &opt, sizeof(opt)), + "setsockopt"); +#else + CHECK_SYS_CALL( + setsockopt(server_fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)), + "setsockopt"); +#endif + + struct sockaddr_in address; + address.sin_family = AF_INET; + address.sin_addr.s_addr = INADDR_ANY; + address.sin_port = htons(port); + + // TODO(wangxi) Set from env, default 900s=15min + int timeout = 900 * 1000; + int try_times = 0; + int total_time = 0; + while (true) { + int ret_val = -1; + RETRY_SYS_CALL_VAL( + bind(server_fd, (struct sockaddr*)&address, sizeof(address)), "bind", + ret_val); + + if (ret_val == -1) { + BindOrConnectFailed(timeout, &try_times, &total_time, "bind", ep); + continue; + } + break; + } + + CHECK_SYS_CALL(listen(server_fd, 3), "listen"); + LOG(INFO) << "Server listening on: " << ep << " successful."; + return server_fd; +} + +void CloseSocket(int fd) { CHECK_SYS_CALL(close(fd), "close"); } + +static int SocketAccept(int server_fd, const char* head) { + struct sockaddr_in client_addr; + socklen_t addr_length = sizeof(client_addr); + char buffer[1024] = {0}; + int conn = -1; + + while (true) { + CHECK_SYS_CALL_VAL( + accept(server_fd, reinterpret_cast(&client_addr), + &addr_length), + "accept", conn); + + int ret_val = SocketRecv(conn, buffer, strlen(head)); + if (ret_val > 0 && strncmp(buffer, head, strlen(head)) == 0) { + break; // accept client + } else { + VLOG(3) << "socket read failed with ret_val=" << ret_val; + CloseSocket(conn); + } + } + return conn; +} + +static int ConnectAddr(const std::string& ep, const char* head) { + auto addr = paddle::string::Split(ep, ':'); + PADDLE_ENFORCE_EQ( + addr.size(), 2UL, + platform::errors::InvalidArgument( + "The endpoint should contain host and port, but got %s.", ep)); + std::string host = addr[0]; + int port = std::stoi(addr[1]); + + int sock = -1; + CHECK_SYS_CALL_VAL(socket(AF_INET, SOCK_STREAM, 0), "socket", sock); + + struct sockaddr_in server_addr; + memset(&server_addr, 0, sizeof(server_addr)); + server_addr.sin_family = AF_INET; + server_addr.sin_port = htons(port); + + char* ip = NULL; + struct hostent* hp = NULL; + hp = gethostbyname(host.c_str()); + PADDLE_ENFORCE_NOT_NULL(hp, platform::errors::InvalidArgument( + "Fail to get host by name %s.", host)); + + int i = 0; + while (hp->h_addr_list[i] != NULL) { + ip = inet_ntoa(*(struct in_addr*)hp->h_addr_list[i]); + VLOG(3) << "gethostbyname host:" << host << " ->ip: " << ip; + break; + } + + PADDLE_ENFORCE_GT(inet_pton(AF_INET, ip, &server_addr.sin_addr), 0, + platform::errors::Unavailable("Open address %s failed: %s", + ep, strerror(errno))); + + // TODO(wangxi) Set from env, default 900s=15min + int timeout = 900 * 1000; + int try_times = 0; + int total_time = 0; + while (true) { + int ret_val = -1; + RETRY_SYS_CALL_VAL( + connect(sock, (struct sockaddr*)&server_addr, sizeof(server_addr)), + "connect", ret_val); + + if (ret_val == -1) { + BindOrConnectFailed(timeout, &try_times, &total_time, "connect", ep); + continue; + } + + CHECK_SYS_CALL(SocketSend(sock, head, strlen(head)), "send"); + break; + } + return sock; +} + +static void RecvHCCLID(int conn, HcclRootInfo* hccl_id) { + char buffer[1024] = {0}; + static_assert(HCCL_UNIQUE_ID_BYTES <= 1024, + "hccl id bytes must <= buffer size"); + + CHECK_SYS_CALL(SocketRecv(conn, buffer, HCCL_UNIQUE_ID_BYTES), + "recv hccl id"); + memcpy(hccl_id, buffer, HCCL_UNIQUE_ID_BYTES); +} + +static void 
SendHCCLID(int conn, HcclRootInfo* hccl_id) { + char buffer[1024] = {0}; + memcpy(buffer, hccl_id, HCCL_UNIQUE_ID_BYTES); + + CHECK_SYS_CALL(SocketSend(conn, buffer, HCCL_UNIQUE_ID_BYTES), + "send hccl id"); +} + +void SendBroadCastHCCLID(std::vector servers, int hccl_comm_num, + std::function func, + const framework::Scope& scope) { + // connect with server + std::vector connects; + for (auto server : servers) { + VLOG(3) << "connecting endpoint: " << server; + int conn = ConnectAddr(server, COMM_HEAD); + connects.push_back(conn); + } + VLOG(3) << "connecting completed..."; + + for (int i = 0; i < hccl_comm_num; ++i) { + std::string var_name = func(i); + auto var = scope.FindVar(var_name); + PADDLE_ENFORCE_NOT_NULL( + var, platform::errors::NotFound("Variable with name %s is not found", + var_name.c_str())); + auto hccl_id = var->GetMutable(); + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclGetRootInfo(hccl_id)); + + int j = 0; + for (auto conn : connects) { + VLOG(3) << "sending hccl_id_var: " << var_name << " to " << servers[j] + << " hccl_comm_no: " << i; + SendHCCLID(conn, hccl_id); + ++j; + } + VLOG(3) << "sending completed..."; + } + + // close client + for (auto conn : connects) { + CloseSocket(conn); + } +} + +void RecvBroadCastHCCLID(std::string endpoint, int hccl_comm_num, + std::function func, + const framework::Scope& scope) { + int server = CreateListenSocket(endpoint); + RecvBroadCastHCCLID(server, endpoint, hccl_comm_num, func, scope); + CloseSocket(server); +} + +void RecvBroadCastHCCLID(int server_fd, std::string endpoint, int hccl_comm_num, + std::function func, + const framework::Scope& scope) { + int client = SocketAccept(server_fd, COMM_HEAD); + + for (int i = 0; i < hccl_comm_num; ++i) { + std::string var_name = func(i); + auto var = scope.FindVar(var_name); + PADDLE_ENFORCE_NOT_NULL( + var, platform::errors::NotFound("Variable with name %s is not found", + var_name.c_str())); + auto hccl_id = var->GetMutable(); + + VLOG(3) << "trainer: " << endpoint << " receiving hccl_id_var: " << var_name + << " from trainer 0, hccl_comm_no: " << i; + RecvHCCLID(client, hccl_id); + } + VLOG(3) << "receiving completed..."; + CloseSocket(client); +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/collective/gen_hccl_id_op_helper.h b/paddle/fluid/operators/collective/gen_hccl_id_op_helper.h new file mode 100644 index 0000000000000000000000000000000000000000..1ad6f791e1fc34d71b982c24cd04c938db71e82f --- /dev/null +++ b/paddle/fluid/operators/collective/gen_hccl_id_op_helper.h @@ -0,0 +1,48 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
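// Editor's note: the helpers above implement a plain TCP fan-out of the HCCL
// root info, roughly:
//
//   sender (e.g. trainer 0)                     receiver (trainer k)
//   -----------------------                     --------------------
//   ConnectAddr(ep_k, COMM_HEAD)  -- magic -->  CreateListenSocket(ep_k)
//                                               SocketAccept() checks COMM_HEAD
//   HcclGetRootInfo(&id)
//   SendHCCLID(conn, &id)         -- 1024B -->  RecvHCCLID(conn, &id)
//   CloseSocket(conn)                           CloseSocket(client)
//
// Exactly HCCL_UNIQUE_ID_BYTES travel per communicator (hccl_comm_num rounds),
// so both ends stay in lock-step. GenHCCLIdOp drives this exchange once for
// the flat ring (trainer 0 to everyone else) and, under hierarchical
// allreduce, once per inter ring (each leader to its followers) plus once for
// the exter ring (trainer 0 to the other inter-ring leaders).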
*/ + +#pragma once + +#include +#include +#include + +namespace paddle { +namespace framework { +class Scope; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace operators { + +int CreateListenSocket(const std::string& ep); + +void CloseSocket(int fd); + +void SendBroadCastHCCLID(std::vector servers, int nccl_comm_num, + std::function func, + const framework::Scope& scope); + +// server listen on endpoint, then recv nccl id +void RecvBroadCastHCCLID(std::string endpoint, int nccl_comm_num, + std::function func, + const framework::Scope& scope); + +// recv nccl id from socket +void RecvBroadCastHCCLID(int server_fd, std::string endpoint, int nccl_comm_num, + std::function func, + const framework::Scope& scope); +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/collective/gen_nccl_id_op.cc b/paddle/fluid/operators/collective/gen_nccl_id_op.cc index 679713d05bcb4025e1d204f62a72c1d3f647316a..99a92469e8502bbc500d627899f3c56fa6bccd66 100644 --- a/paddle/fluid/operators/collective/gen_nccl_id_op.cc +++ b/paddle/fluid/operators/collective/gen_nccl_id_op.cc @@ -34,6 +34,7 @@ class Scope; namespace paddle { namespace operators { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) static void GenNCCLID(std::vector* nccl_ids) { for (size_t i = 0; i < nccl_ids->size(); ++i) { PADDLE_ENFORCE_CUDA_SUCCESS( @@ -194,6 +195,20 @@ class GenNCCLIdOp : public framework::OperatorBase { } }; +#else +class GenNCCLIdOp : public framework::OperatorBase { + public: + GenNCCLIdOp(const std::string& type, const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void RunImpl(const framework::Scope& scope, + const platform::Place& dev_place) const override {} +}; + +#endif + class GenNCCLIdOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { diff --git a/paddle/fluid/operators/collective/recv_v2_op.cc b/paddle/fluid/operators/collective/recv_v2_op.cc index 0ae7b821617f9144c09c6912be47c2be7e38da69..39a9ed0c74ef59d8520147572b9ab0da8c567da2 100644 --- a/paddle/fluid/operators/collective/recv_v2_op.cc +++ b/paddle/fluid/operators/collective/recv_v2_op.cc @@ -70,6 +70,12 @@ class RecvOpV2Maker : public framework::OpProtoAndCheckerMaker { AddAttr("peer", "(int default 0) rank id for sender.").SetDefault(0); AddAttr("dtype", "(int default 5('float32')) data type of tensor.") .SetDefault(5); +#if defined(PADDLE_WITH_ASCEND_CL) + AddAttr("tag", "(string default tag) tag for broadcasting.") + .SetDefault("tag"); + AddAttr("srTag", "(string default tag) tag for broadcasting.") + .SetDefault(0); +#endif AddAttr>("out_shape", "shape of the output tensor.") .SetDefault(std::vector()); AddAttr( diff --git a/paddle/fluid/operators/collective/recv_v2_op_npu.cc b/paddle/fluid/operators/collective/recv_v2_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..69f1f4681a33d68d9a4d0efa09bd33d01834cff6 --- /dev/null +++ b/paddle/fluid/operators/collective/recv_v2_op_npu.cc @@ -0,0 +1,79 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/recv_v2_op.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace paddle { +namespace operators { + +template +class CRecvOpASCENDKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_ASCEND_CL) + auto x = ctx.Output("Out"); + void* ptr = reinterpret_cast(const_cast(x->data())); + int numel = x->numel(); + HcclDataType dtype = platform::ToHCCLDataType(x->type()); + + int ring_id = ctx.Attr("ring_id"); + auto place = ctx.GetPlace(); + auto comm = + paddle::platform::HCCLCommContext::Instance().Get(ring_id, place); + + aclrtStream stream = nullptr; + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + if (ctx.Attr("use_calc_stream")) { + stream = static_cast(dev_ctx)->stream(); + } else { + stream = comm->stream(); + } + + int nranks = comm->nranks(); + int peer = ctx.Attr("peer"); + + PADDLE_ENFORCE_EQ(nranks, 2, platform::errors::InvalidArgument( + "The nranks must be 2, but (%d)", nranks)); + + int root = peer; + + VLOG(3) << "begin hccl recv, parameter is: " + << "root " << root << ", comm: " << comm->comm() + << ", stream: " << stream; + + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclBroadcast( + ptr, numel, dtype, (uint32_t)root, comm->comm(), stream)); + +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with NPU.")); +#endif + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(recv_v2, ops::CRecvOpASCENDKernel, + ops::CRecvOpASCENDKernel, + ops::CRecvOpASCENDKernel, + ops::CRecvOpASCENDKernel); diff --git a/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc b/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..384dfd1fc5f2d3140d5d9278624a65348cc72132 --- /dev/null +++ b/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc @@ -0,0 +1,165 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
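// Editor's note: HCCL as used in this patch exposes no dedicated point-to-point
// receive, so recv_v2 is emulated with a two-rank broadcast: the ring must
// contain exactly two members (hence the nranks == 2 check above), the peer
// attribute names the sender and is used as the broadcast root, and the
// receiver passes its own output buffer as the broadcast data pointer,
// conceptually
//
//   HcclBroadcast(out_ptr, numel, dtype, /*root=*/peer, comm, stream);
//
// The send_v2 NPU kernel later in this patch makes the mirror-image call with
// root set to its own rank, so the pair behaves like send/recv on a two-rank
// ring.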
*/ + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" +#include "paddle/fluid/operators/collective/recv_v2_op.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(recv_v2); +USE_NO_KERNEL_OP(c_gen_hccl_id); +USE_NO_KERNEL_OP(c_comm_init_hccl); +USE_OP_DEVICE_KERNEL(recv_v2, NPU); + +void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + std::vector rank_ids{0, 1}; + f::AttributeMap gen_hccl_id; + + std::vector endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"}; + gen_hccl_id["rank"] = rank_id; + gen_hccl_id["endpoint"] = endpointList[rank_id]; + std::vector other_endpoints = { + endpointList[rank_id == 0 ? 1 : 0]}; + gen_hccl_id["other_endpoints"] = other_endpoints; + + auto out = scope->Var("Out"); + auto id = out->GetMutable(); + + VLOG(3) << "break"; + + auto comm_init_op = f::OpRegistry::CreateOp("c_gen_hccl_id", {}, + {{"Out", {"Out"}}}, gen_hccl_id); + VLOG(3) << "break"; + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); + + memcpy(hccl_id, id, 1024); +} + +void Prepare(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + auto x = scope->Var("X"); + auto id = x->GetMutable(); + + memcpy(id, hccl_id, 1024); + + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + // std::vector rank_ids{0, 1}; + f::AttributeMap comm_init_attrs; + comm_init_attrs["ring_id"] = 0; + comm_init_attrs["rank_ids"] = 2; + comm_init_attrs["rank"] = rank_id; + comm_init_attrs["device_id"] = device_id; + // comm_init_attrs["rank_ids"] = rank_ids; + auto comm_init_op = f::OpRegistry::CreateOp( + "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); +} + +void TestHcomRecvOp(f::Scope* scope, const p::DeviceContext& ctx) { + std::cout << "BEGIN TEST:" << __FUNCTION__ << std::endl; + + int num = atoi(getenv("DATA_SIZE")); + EXPECT_GT(num, 0); + EXPECT_LT(num, 1 << 15); + int rank_id = atoi(getenv("RANK_ID")); + VLOG(3) << "rank_id:" << rank_id << std::endl; + + ctx.Wait(); + auto place = ctx.GetPlace(); + auto out = scope->Var("Data"); + auto tensor_out = out->GetMutable(); + tensor_out->Resize({num, num}); + tensor_out->mutable_data(place); // allocate + + ctx.Wait(); + + f::AttributeMap attrs; + attrs["tag"] = std::string("srtest"); + attrs["peer"] = atoi(getenv("SRC_RANK")); + attrs["ring_id"] = 0; + attrs["srTag"] = 0; + std::vector out_shape; + 
out_shape.push_back(num); + out_shape.push_back(num); + attrs["out_shape"] = out_shape; + + auto op = f::OpRegistry::CreateOp("recv_v2", {}, {{"Out", {"Data"}}}, attrs); + VLOG(3) << "CreateOp recv_v2"; + + for (int i = 0; i < 10; i++) { + op->Run(*scope, place); + } + VLOG(3) << "Run op recv_v2"; + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + ctx.Wait(); + std::vector init(num * num, 1.0 * atoi(getenv("DEST_RANK"))); + EXPECT_EQ(out_vec == init, true); +} + +TEST(recv_v2, NPU) { + f::Scope scope; + HcclRootInfo hccl_id; + + char* npu_id = getenv("FLAGS_selected_npus"); + VLOG(3) << "Select npu:" << npu_id; + p::NPUDeviceContext ctx(p::NPUPlace(atoi(npu_id))); + + PrepareUniqueId(&scope, ctx, &hccl_id); + Prepare(&scope, ctx, &hccl_id); + TestHcomRecvOp(&scope, ctx); +} diff --git a/paddle/fluid/operators/collective/send_v2_op.cc b/paddle/fluid/operators/collective/send_v2_op.cc index c5a86b4f08813a3bb548bec829b7d07f16681043..c60d560e43baed37d1fc4392e8afc356ffdbd949 100644 --- a/paddle/fluid/operators/collective/send_v2_op.cc +++ b/paddle/fluid/operators/collective/send_v2_op.cc @@ -50,6 +50,12 @@ class SendOpV2Maker : public framework::OpProtoAndCheckerMaker { AddAttr("ring_id", "(int default 0) nccl communication ring id.") .SetDefault(0); AddAttr("peer", "(int default 0) rank id for receiver.").SetDefault(0); +#if defined(PADDLE_WITH_ASCEND_CL) + AddAttr("tag", "(string default tag) tag for broadcasting.") + .SetDefault("tag"); + AddAttr("srTag", "(string default tag) tag for broadcasting.") + .SetDefault(0); +#endif AddAttr( "use_calc_stream", "(bool default false) eject CUDA operations to calculation stream.") diff --git a/paddle/fluid/operators/collective/send_v2_op_npu.cc b/paddle/fluid/operators/collective/send_v2_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..0ade090fcaac073875de1c1822fa0c45ad4f674b --- /dev/null +++ b/paddle/fluid/operators/collective/send_v2_op_npu.cc @@ -0,0 +1,79 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/collective/send_v2_op.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace paddle { +namespace operators { + +template +class CSendOpASCENDKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_ASCEND_CL) + auto x = ctx.Input("X"); + void* ptr = reinterpret_cast(const_cast(x->data())); + int numel = x->numel(); + HcclDataType dtype = platform::ToHCCLDataType(x->type()); + + int ring_id = ctx.Attr("ring_id"); + auto place = ctx.GetPlace(); + auto comm = + paddle::platform::HCCLCommContext::Instance().Get(ring_id, place); + + aclrtStream stream = nullptr; + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + if (ctx.Attr("use_calc_stream")) { + stream = static_cast(dev_ctx)->stream(); + } else { + stream = comm->stream(); + } + + int nranks = comm->nranks(); + int rank = comm->rank(); + + PADDLE_ENFORCE_EQ(nranks, 2, platform::errors::InvalidArgument( + "The nranks must be 2, but (%d)", nranks)); + + int root = rank; + + VLOG(3) << "begin hccl send, parameter is: " + << "root " << root << ", comm: " << comm->comm() + << ", stream: " << stream; + + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclBroadcast( + ptr, numel, dtype, (uint32_t)root, comm->comm(), stream)); + +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with NPU.")); +#endif + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(send_v2, ops::CSendOpASCENDKernel, + ops::CSendOpASCENDKernel, + ops::CSendOpASCENDKernel, + ops::CSendOpASCENDKernel); diff --git a/paddle/fluid/operators/collective/send_v2_op_npu_test.cc b/paddle/fluid/operators/collective/send_v2_op_npu_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..cf01b1d0a6a1d12f98a4715f346fcdcb0bb6ae05 --- /dev/null +++ b/paddle/fluid/operators/collective/send_v2_op_npu_test.cc @@ -0,0 +1,154 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
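// Editor's note: like the recv_v2 test above, the send_v2 test below is driven
// entirely by environment variables (a summary of what the code reads, not new
// behaviour):
//   RANK_ID / DEVICE_ID   - this process's rank and NPU device, consumed by
//                           c_gen_hccl_id / c_comm_init_hccl during setup
//   FLAGS_selected_npus   - the NPU the NPUDeviceContext is created on
//   DATA_SIZE             - edge length n of the square tensor (0 < n < 2^15)
//   SRC_RANK              - the sending peer for recv_v2
//   DEST_RANK             - the receiving peer for send_v2 and the value used
//                           to fill the payload and check the received tensor
// so both ranks of the two-process test must be started with a consistent set
// of these variables.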
*/ + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include // NOLINT +#include +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" +#include "paddle/fluid/operators/collective/send_v2_op.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(send_v2); +USE_NO_KERNEL_OP(c_gen_hccl_id); +USE_NO_KERNEL_OP(c_comm_init_hccl); +USE_OP_DEVICE_KERNEL(send_v2, NPU); + +void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + std::vector rank_ids{0, 1}; + f::AttributeMap gen_hccl_id; + + std::vector endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"}; + gen_hccl_id["rank"] = rank_id; + gen_hccl_id["endpoint"] = endpointList[rank_id]; + std::vector other_endpoints = { + endpointList[rank_id == 0 ? 1 : 0]}; + gen_hccl_id["other_endpoints"] = other_endpoints; + + auto out = scope->Var("Out"); + auto id = out->GetMutable(); + + VLOG(3) << "break"; + + auto comm_init_op = f::OpRegistry::CreateOp("c_gen_hccl_id", {}, + {{"Out", {"Out"}}}, gen_hccl_id); + VLOG(3) << "break"; + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); + + memcpy(hccl_id, id, 1024); +} + +void Prepare(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + auto x = scope->Var("X"); + auto id = x->GetMutable(); + + memcpy(id, hccl_id, 1024); + + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + // std::vector rank_ids{0, 1}; + f::AttributeMap comm_init_attrs; + comm_init_attrs["ring_id"] = 0; + comm_init_attrs["rank_ids"] = 2; + comm_init_attrs["rank"] = rank_id; + comm_init_attrs["device_id"] = device_id; + // comm_init_attrs["rank_ids"] = rank_ids; + auto comm_init_op = f::OpRegistry::CreateOp( + "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); +} + +void TestHcomSendOp(f::Scope* scope, const p::DeviceContext& ctx) { + std::cout << "BEGIN TEST:" << __FUNCTION__ << std::endl; + auto x = scope->Var("Data"); + auto tensor_x = x->GetMutable(); + int num = atoi(getenv("DATA_SIZE")); + + EXPECT_GT(num, 0); + EXPECT_LT(num, 1 << 15); + std::vector init(num * num, 1.0 * atoi(getenv("DEST_RANK"))); + int rank_id = atoi(getenv("RANK_ID")); + VLOG(3) << "rank id:" << rank_id; + TensorFromVector(init, ctx, tensor_x); + tensor_x->Resize({num, num}); + ctx.Wait(); + auto place = ctx.GetPlace(); + ctx.Wait(); + + f::AttributeMap attrs; + attrs["tag"] = std::string("srtest"); + attrs["peer"] = atoi(getenv("DEST_RANK")); + attrs["ring_id"] = 0; + attrs["srTag"] = 0; + + auto op = 
f::OpRegistry::CreateOp("send_v2", {{"X", {"Data"}}}, {}, attrs); + + for (int i = 0; i < 10; i++) { + op->Run(*scope, place); + } + VLOG(3) << "send run over"; + ctx.Wait(); +} + +TEST(send_v2, NPU) { + f::Scope scope; + HcclRootInfo hccl_id; + + char* npu_id = getenv("FLAGS_selected_npus"); + VLOG(3) << "Select npu:" << npu_id; + p::NPUDeviceContext ctx(p::NPUPlace(atoi(npu_id))); + + PrepareUniqueId(&scope, ctx, &hccl_id); + Prepare(&scope, ctx, &hccl_id); + TestHcomSendOp(&scope, ctx); +} diff --git a/paddle/fluid/operators/concat_op_npu.cc b/paddle/fluid/operators/concat_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..87bb3397ca2672ce377b74682cb0445e31b03677 --- /dev/null +++ b/paddle/fluid/operators/concat_op_npu.cc @@ -0,0 +1,126 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/concat_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +template +class ConcatNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto ins = ctx.MultiInput("X"); + framework::LoDTensor* out = ctx.Output("Out"); + PADDLE_ENFORCE_NOT_NULL(ins[0], + platform::errors::NotFound( + "The first input tensor is not initalized.")); + auto axis = ctx.Attr("axis"); + + if (ctx.HasInput("AxisTensor")) { + PADDLE_THROW(platform::errors::NotFound( + "The AxisTensor is not supported on NPU now.")); + } + axis = ComputeAxis(static_cast(axis), + static_cast(ins[0]->dims().size())); + + auto place = ctx.GetPlace(); + out->mutable_data(place); + + std::vector inputs; + std::vector names; + for (size_t i = 0; i < ins.size(); ++i) { + if (ins[i] && ins[i]->numel() > 0) { + inputs.push_back(*ins[i]); + names.push_back("x" + std::to_string(i)); + } else { + continue; + } + } + auto stream = + ctx.template device_context() + .stream(); + auto runner = NpuOpRunner( + "ConcatD", {inputs}, {*out}, + {{"concat_dim", axis}, {"N", static_cast(inputs.size())}}); + runner.AddInputNames(names); + runner.Run(stream); + } +}; + +template +class ConcatGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* out_grad = + ctx.Input(framework::GradVarName("Out")); + auto ins = ctx.MultiInput("X"); + auto out_var_names = ctx.OutputNames(framework::GradVarName("X")); + auto outs = + ctx.MultiOutput(framework::GradVarName("X")); + + PADDLE_ENFORCE_NOT_NULL(ins[0], + platform::errors::NotFound( + "The first input tensor is not initalized.")); + + auto axis = ctx.Attr("axis"); + + axis = ComputeAxis(static_cast(axis), + static_cast(ins[0]->dims().size())); + + int offset = 0; + auto stream = + ctx.template device_context() + .stream(); + for (size_t j = 0; j < outs.size(); ++j) { + // For stop gradient + // get output tensor that the name is not kEmptyVarName + if (out_var_names[j] != framework::kEmptyVarName && + outs[j]->numel() != 0UL) 
{ + outs[j]->mutable_data(ctx.GetPlace()); + std::vector offsets; + std::vector sizes; + for (int dim = 0; dim < ins[j]->dims().size(); ++dim) { + if (dim == axis) { + offsets.push_back(offset); + sizes.push_back(ins[j]->dims()[dim]); + } else { + offsets.push_back(0); + sizes.push_back(ins[j]->dims()[dim]); + } + } + auto runner = NpuOpRunner("SliceD", {*out_grad}, {*outs[j]}, + {{"offsets", offsets}, {"size", sizes}}); + runner.Run(stream); + } + if (ins[j]->numel() != 0UL) { + offset += ins[j]->dims()[axis]; + } + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL(concat, ops::ConcatNPUKernel, + ops::ConcatNPUKernel, + ops::ConcatNPUKernel); + +REGISTER_OP_NPU_KERNEL(concat_grad, ops::ConcatGradNPUKernel, + ops::ConcatGradNPUKernel, + ops::ConcatGradNPUKernel); diff --git a/paddle/fluid/operators/concat_op_xpu.cc b/paddle/fluid/operators/concat_op_xpu.cc index aa0002cc6d1777dab6e598fc7c123e5255d0f094..be299babdba7a4f450bafdf5dce8e686f0493fce 100644 --- a/paddle/fluid/operators/concat_op_xpu.cc +++ b/paddle/fluid/operators/concat_op_xpu.cc @@ -132,16 +132,14 @@ class ConcatGradXPUKernel : public framework::OpKernel { axis = ComputeAxis(static_cast(axis), static_cast(ins[0]->dims().size())); // get output tensor that the name is not kEmptyVarName - std::vector outputs; - std::vector choose_idx; - int n = 0; + std::vector ptrs(outs.size()); for (size_t j = 0; j < outs.size(); ++j) { if (out_var_names[j] != framework::kEmptyVarName && outs[j]->numel() != 0UL) { outs[j]->mutable_data(ctx.GetPlace()); - outputs.push_back(outs[j]); - choose_idx.push_back(j); - n++; + ptrs[j] = outs[j]->data(); + } else { + ptrs[j] = nullptr; } } PADDLE_ENFORCE_GE(axis, 0, platform::errors::InvalidArgument( @@ -157,10 +155,10 @@ class ConcatGradXPUKernel : public framework::OpKernel { axis, out_grad->dims().size())); auto input_dims = ins[0]->dims(); - std::vector split_list(n); + std::vector split_list(ins.size()); std::vector xdims_list(input_dims.size()); int total_length = 0; - for (int i = 0; i < n; ++i) { + for (size_t i = 0; i < ins.size(); ++i) { split_list[i] = ins[i]->dims()[axis]; total_length += ins[i]->dims()[axis]; } @@ -172,11 +170,6 @@ class ConcatGradXPUKernel : public framework::OpKernel { } xdims_list[axis] = total_length; - std::vector ptrs(n); - for (int i = 0; i < n; ++i) { - ptrs[i] = outputs[i]->data(); - } - auto& dev_ctx = ctx.template device_context(); int r = xpu::split(dev_ctx.x_context(), out_grad->data(), ptrs, xdims_list, split_list, axis); diff --git a/paddle/fluid/operators/controlflow/compare_op.cc b/paddle/fluid/operators/controlflow/compare_op.cc index 3cad86d96c26a0e25fcbaeb02405315895744e50..bf047de86fc21a4d5d9e9ff8f20c9a1982eb25af 100644 --- a/paddle/fluid/operators/controlflow/compare_op.cc +++ b/paddle/fluid/operators/controlflow/compare_op.cc @@ -23,29 +23,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -template -class CompareOpKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - using T = typename Functor::ELEM_TYPE; - using Tensor = framework::Tensor; - - auto* x = context.Input("X"); - auto* y = context.Input("Y"); - auto* z = context.Output("Out"); - int axis = context.Attr("axis"); - - if (x->numel() == 1 && y->numel() == 1) { - bool* z_data = z->mutable_data(context.GetPlace()); - z_data[0] = Functor()(x->data()[0], y->data()[0]); - } else { - ElementwiseComputeEx( - context, x, y, axis, Functor(), z); - } - } -}; - template class CompareOpProtoMaker : public framework::OpProtoAndCheckerMaker { public: @@ -153,16 +130,22 @@ class CompareOp : public framework::OperatorWithKernel { REGISTER_COMPARE_OP_VERSION(op_type); REGISTER_COMPARE_OP(less_than, "Out = X < Y"); -REGISTER_COMPARE_KERNEL(less_than, CPU, paddle::operators::LessThanFunctor); +REGISTER_COMPARE_KERNEL(less_than, CPU, paddle::operators::LessThanFunctor, + paddle::operators::GreaterEqualFunctor); REGISTER_COMPARE_OP(less_equal, "Out = X <= Y"); -REGISTER_COMPARE_KERNEL(less_equal, CPU, paddle::operators::LessEqualFunctor); +REGISTER_COMPARE_KERNEL(less_equal, CPU, paddle::operators::LessEqualFunctor, + paddle::operators::GreaterThanFunctor); REGISTER_COMPARE_OP(greater_than, "Out = X > Y"); REGISTER_COMPARE_KERNEL(greater_than, CPU, - paddle::operators::GreaterThanFunctor); + paddle::operators::GreaterThanFunctor, + paddle::operators::LessEqualFunctor); REGISTER_COMPARE_OP(greater_equal, "Out = X >= Y"); REGISTER_COMPARE_KERNEL(greater_equal, CPU, - paddle::operators::GreaterEqualFunctor); + paddle::operators::GreaterEqualFunctor, + paddle::operators::LessThanFunctor); REGISTER_COMPARE_OP(equal, "Out = X == Y"); -REGISTER_COMPARE_KERNEL(equal, CPU, paddle::operators::EqualFunctor); +REGISTER_COMPARE_KERNEL(equal, CPU, paddle::operators::EqualFunctor, + paddle::operators::EqualFunctor); REGISTER_COMPARE_OP(not_equal, "Out = X != Y"); -REGISTER_COMPARE_KERNEL(not_equal, CPU, paddle::operators::NotEqualFunctor); +REGISTER_COMPARE_KERNEL(not_equal, CPU, paddle::operators::NotEqualFunctor, + paddle::operators::NotEqualFunctor); diff --git a/paddle/fluid/operators/controlflow/compare_op.cu b/paddle/fluid/operators/controlflow/compare_op.cu index b1f306358359764b919f9e570cf44f9733a7d178..3ca700e16e6e7bcf4136ca68dd895593a63824ec 100644 --- a/paddle/fluid/operators/controlflow/compare_op.cu +++ b/paddle/fluid/operators/controlflow/compare_op.cu @@ -14,11 +14,17 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/controlflow/compare_op.h" -REGISTER_COMPARE_KERNEL(less_than, CUDA, paddle::operators::LessThanFunctor); -REGISTER_COMPARE_KERNEL(less_equal, CUDA, paddle::operators::LessEqualFunctor); -REGISTER_COMPARE_KERNEL(greater_than, CUDA, +REGISTER_COMPARE_KERNEL(less_than, CUDA, paddle::operators::LessThanFunctor, + paddle::operators::GreaterEqualFunctor); +REGISTER_COMPARE_KERNEL(less_equal, CUDA, paddle::operators::LessEqualFunctor, paddle::operators::GreaterThanFunctor); +REGISTER_COMPARE_KERNEL(greater_than, CUDA, + paddle::operators::GreaterThanFunctor, + paddle::operators::LessEqualFunctor); REGISTER_COMPARE_KERNEL(greater_equal, CUDA, - paddle::operators::GreaterEqualFunctor); -REGISTER_COMPARE_KERNEL(equal, CUDA, paddle::operators::EqualFunctor); -REGISTER_COMPARE_KERNEL(not_equal, CUDA, paddle::operators::NotEqualFunctor); + paddle::operators::GreaterEqualFunctor, + paddle::operators::LessThanFunctor); +REGISTER_COMPARE_KERNEL(equal, CUDA, paddle::operators::EqualFunctor, + paddle::operators::EqualFunctor); +REGISTER_COMPARE_KERNEL(not_equal, CUDA, paddle::operators::NotEqualFunctor, + paddle::operators::NotEqualFunctor); diff --git a/paddle/fluid/operators/controlflow/compare_op.h b/paddle/fluid/operators/controlflow/compare_op.h index b7529e4ae632d31524846d9d5aa4b1883f4509a1..ff929ee7dfce79536a9ce7c8ae6878fb7e3871e9 100644 --- a/paddle/fluid/operators/controlflow/compare_op.h +++ b/paddle/fluid/operators/controlflow/compare_op.h @@ -68,7 +68,7 @@ struct NotEqualFunctor { } }; -template +template class CompareOpKernel : public framework::OpKernel { public: @@ -80,21 +80,33 @@ class CompareOpKernel auto* y = context.Input("Y"); auto* z = context.Output("Out"); int axis = context.Attr("axis"); - ElementwiseComputeEx(context, x, y, axis, - Functor(), z); + + auto x_dims = x->dims(); + auto y_dims = y->dims(); + if (x_dims.size() >= y_dims.size()) { + ElementwiseComputeEx(context, x, y, axis, + Functor(), z); + } else { + ElementwiseComputeEx( + context, x, y, axis, InverseFunctor(), z); + } } }; } // namespace operators } // namespace paddle -#define REGISTER_COMPARE_KERNEL(op_type, dev, functor) \ - REGISTER_OP_##dev##_KERNEL( \ - op_type, ::paddle::operators::CompareOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::CompareOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::CompareOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::CompareOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>); +#define REGISTER_COMPARE_KERNEL(op_type, dev, functor, inverse_functor) \ + REGISTER_OP_##dev##_KERNEL(op_type, \ + ::paddle::operators::CompareOpKernel< \ + ::paddle::platform::dev##DeviceContext, \ + functor, inverse_functor>, \ + ::paddle::operators::CompareOpKernel< \ + ::paddle::platform::dev##DeviceContext, \ + functor, inverse_functor>, \ + ::paddle::operators::CompareOpKernel< \ + ::paddle::platform::dev##DeviceContext, \ + functor, inverse_functor>, \ + ::paddle::operators::CompareOpKernel< \ + ::paddle::platform::dev##DeviceContext, \ + functor, inverse_functor>); diff --git a/paddle/fluid/operators/controlflow/compare_op_npu.cc b/paddle/fluid/operators/controlflow/compare_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..591fb55936734ffc675dad5c6912e7cbf4e80471 --- /dev/null +++ b/paddle/fluid/operators/controlflow/compare_op_npu.cc @@ -0,0 +1,78 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/operators/controlflow/compare_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" +#include "paddle/fluid/operators/npu_op_runner.h" +#ifdef PADDLE_WITH_ASCEND_CL + +namespace paddle { +namespace operators { + +template +class EqualNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + auto runner = NpuOpRunner("Equal", {*x, *y}, {*out}, {}); + auto stream = + ctx.template device_context() + .stream(); + runner.Run(stream); + } +}; + +template +class LessThanNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* z = ctx.Output("Out"); + // int axis = context.Attr("axis"); + z->mutable_data(ctx.GetPlace()); // allocate + auto runner = NpuOpRunner("Less", {*x, *y}, {*z}); + auto stream = + ctx.template device_context() + .stream(); + runner.Run(stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(equal, ops::EqualNPUKernel, + ops::EqualNPUKernel, + ops::EqualNPUKernel); + +REGISTER_OP_NPU_KERNEL( + less_than, + ops::LessThanNPUKernel, + ops::LessThanNPUKernel); + +#endif diff --git a/paddle/fluid/operators/controlflow/conditional_block_op.h b/paddle/fluid/operators/controlflow/conditional_block_op.h index b9ea2ade6cb90b71d423ba977215ab693f19b562..6513bae839e9894ee2b342c5d61fcbf9191a4123 100644 --- a/paddle/fluid/operators/controlflow/conditional_block_op.h +++ b/paddle/fluid/operators/controlflow/conditional_block_op.h @@ -78,6 +78,13 @@ class ConditionalOp : public framework::OperatorBase { framework::TensorCopy(*ips[0], platform::CPUPlace(), &cpu_tensor); platform::DeviceContextPool::Instance().Get(ips[0]->place())->Wait(); res = cpu_tensor.data()[0]; +#endif + } else if (platform::is_npu_place(ips[0]->place())) { +#ifdef PADDLE_WITH_ASCEND_CL + framework::LoDTensor cpu_tensor; + framework::TensorCopy(*ips[0], platform::CPUPlace(), &cpu_tensor); + platform::DeviceContextPool::Instance().Get(ips[0]->place())->Wait(); + res = cpu_tensor.data()[0]; #endif } else { res = ips[0]->data()[0]; diff --git a/paddle/fluid/operators/controlflow/conditional_block_op_helper.h b/paddle/fluid/operators/controlflow/conditional_block_op_helper.h index 22eb2ece4b05b8ad7fad3acdc545e3c98d211f31..7ce63aa9cbbfaaa4adb7834dd33e24cb6491a7a9 100644 --- a/paddle/fluid/operators/controlflow/conditional_block_op_helper.h +++ b/paddle/fluid/operators/controlflow/conditional_block_op_helper.h @@ -19,6 +19,7 @@ #include 
"paddle/fluid/framework/operator.h" #include "paddle/fluid/operators/controlflow/conditional_block_op.h" +#include "paddle/fluid/string/string_helper.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/operators/controlflow/fetch_op.cc b/paddle/fluid/operators/controlflow/fetch_op.cc index d86b6b48422d94604724303de72f401bfba2e23e..fdd1b776bd8fa3f24fb596af29512f1f781dce4c 100644 --- a/paddle/fluid/operators/controlflow/fetch_op.cc +++ b/paddle/fluid/operators/controlflow/fetch_op.cc @@ -44,6 +44,11 @@ static void DataCopy(const framework::LoDTensor &src_item, TensorCopySync(src_item, platform::CPUPlace(), dst_item); } #else +#ifdef PADDLE_WITH_ASCEND_CL + if (platform::is_npu_place(src_item.place())) { + platform::DeviceContextPool::Instance().Get(src_item.place())->Wait(); + } +#endif TensorCopySync(src_item, platform::CPUPlace(), dst_item); #endif } else { diff --git a/paddle/fluid/operators/controlflow/logical_op_npu.cc b/paddle/fluid/operators/controlflow/logical_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..1b0c0e444347af0a90f8244590b84199dc97f931 --- /dev/null +++ b/paddle/fluid/operators/controlflow/logical_op_npu.cc @@ -0,0 +1,57 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef PADDLE_WITH_ASCEND_CL +#include +#include + +#include "paddle/fluid/operators/controlflow/logical_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class LogicalNotNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + + auto* out = ctx.Output("Out"); + + auto place = ctx.GetPlace(); + + out->mutable_data(place); + + auto stream = + ctx.template device_context() + .stream(); + + auto runner = NpuOpRunner("LogicalNot", {*x}, {*out}, {}); + runner.Run(stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL( + logical_not, + ops::LogicalNotNPUKernel); + +#endif diff --git a/paddle/fluid/operators/conv_cudnn_op.cu b/paddle/fluid/operators/conv_cudnn_op.cu index 110bb69a14083ebbe26c1e26e6cf585b70f6b825..7fdb1ccfe9614fc0b30c7e13f564ece217c08b36 100644 --- a/paddle/fluid/operators/conv_cudnn_op.cu +++ b/paddle/fluid/operators/conv_cudnn_op.cu @@ -249,6 +249,7 @@ class CUDNNConvOpKernel : public framework::OpKernel { args.handle = handle; #ifdef PADDLE_WITH_HIP + // MIOPEN need to set groups in cdesc in miopen_desc.h args.cdesc.set(dtype, padding_common, strides, dilations, platform::AllowTF32Cudnn(), groups); #else @@ -264,6 +265,10 @@ class CUDNNConvOpKernel : public framework::OpKernel { platform::dynload::cudnnSetConvolutionGroupCount(args.cdesc.desc(), groups)); groups = 1; +#endif +#ifdef PADDLE_WITH_HIP + // MIOPEN do not set groups in wdesc after set groups in cdesc + groups = 1; #endif args.idesc.set(transformed_input, layout_format); args.wdesc.set(transformed_filter_channel, layout_format, groups); @@ -292,12 +297,14 @@ class CUDNNConvOpKernel : public framework::OpKernel { #ifdef PADDLE_WITH_HIP miopenConvFwdAlgorithm_t algo{}; using search = SearchAlgorithm; + workspace_size = search::GetWorkspaceSize(args); + algo = search::Find(args, exhaustive_search, false, workspace_size, ctx); #else cudnnConvolutionFwdAlgo_t algo{}; using search = SearchAlgorithm; -#endif algo = search::Find(args, exhaustive_search, false, ctx); workspace_size = search::GetWorkspaceSize(args, algo); +#endif #if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION_MIN(7, 0, 1) // when groups > 1, SearchAlgorithm find algo is CUDNN_CONVOLUTION_\ @@ -652,13 +659,17 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { #ifdef PADDLE_WITH_HIP using search1 = SearchAlgorithm; + workspace_size = + std::max(workspace_size, search1::GetWorkspaceSize(args1)); + data_algo = search1::Find(args1, exhaustive_search, deterministic, + workspace_size, ctx); #else using search1 = SearchAlgorithm; -#endif data_algo = search1::Find(args1, exhaustive_search, deterministic, ctx); workspace_size = std::max(workspace_size, search1::GetWorkspaceSize(args1, data_algo)); +#endif } if (filter_grad) { @@ -673,38 +684,68 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { platform::AllowTF32Cudnn(), c_groups); #ifdef PADDLE_WITH_HIP using search2 = SearchAlgorithm; + workspace_size = + std::max(workspace_size, search2::GetWorkspaceSize(args2)); + filter_algo = search2::Find(args2, exhaustive_search, deterministic, + workspace_size, ctx); #else using search2 = SearchAlgorithm; -#endif filter_algo = search2::Find(args2, exhaustive_search, deterministic, ctx); workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2, filter_algo)); 
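+      // NOTE: the ordering differs between the two branches on purpose. With
+      // cuDNN the workspace size depends on the chosen algorithm, so Find()
+      // runs first and GetWorkspaceSize(args, algo) second; with MIOPEN the
+      // workspace is sized from the descriptors alone and is then handed to
+      // Find(), which runs the miopenFind* search inside that workspace.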
+#endif } // ------------------- cudnn conv backward data --------------------- ScalingParamType alpha = 1.0f; +#ifdef PADDLE_WITH_HIP + // MIOPEN ONLY support beta to be 0.0f + ScalingParamType beta = 0.0f; +#else ScalingParamType beta = ctx.Attr("use_addto") ? 1.0f : 0.0f; +#endif VLOG(4) << "Conv_grad: use_addto = " << ctx.Attr("use_addto"); if (input_grad) { - // When beta is 0, it is unnecessary to reset input_grad. - // When beta is 1, the output cannot be reset since addt strategy used. - for (int i = 0; i < groups; i++) { +// When beta is 0, it is unnecessary to reset input_grad. +// When beta is 1, the output cannot be reset since addt strategy used. #ifdef PADDLE_WITH_HIP + if (ctx.Attr("use_addto")) { + Tensor temp_tensor(transformed_input_grad.type()); + temp_tensor.Resize(transformed_input_grad.dims()); + T* temp_tensor_data = temp_tensor.mutable_data(ctx.GetPlace()); workspace_handle.RunFunc( [&](void* cudnn_workspace_ptr) { PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::miopenConvolutionBackwardData( - handle, &alpha, args1.odesc.desc(), - output_grad_data + i * group_offset_out, - args1.wdesc.desc(), filter_data + i * group_offset_filter, - args1.cdesc.desc(), data_algo, &beta, args1.idesc.desc(), - transformed_input_grad_data + i * group_offset_in, + handle, &alpha, args1.odesc.desc(), output_grad_data, + args1.wdesc.desc(), filter_data, args1.cdesc.desc(), + data_algo, &beta, args1.idesc.desc(), temp_tensor_data, cudnn_workspace_ptr, workspace_size)); }, workspace_size); + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenOpTensor( + handle, miopenTensorOpAdd, &alpha, args1.idesc.desc(), + transformed_input_grad_data, &alpha, args1.idesc.desc(), + temp_tensor_data, &beta, args1.idesc.desc(), + transformed_input_grad_data)); + } else { + workspace_handle.RunFunc( + [&](void* cudnn_workspace_ptr) { + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::miopenConvolutionBackwardData( + handle, &alpha, args1.odesc.desc(), output_grad_data, + args1.wdesc.desc(), filter_data, args1.cdesc.desc(), + data_algo, &beta, args1.idesc.desc(), + transformed_input_grad_data, cudnn_workspace_ptr, + workspace_size)); + }, + workspace_size); + } + #else + for (int i = 0; i < groups; i++) { workspace_handle.RunFunc( [&](void* cudnn_workspace_ptr) { PADDLE_ENFORCE_CUDA_SUCCESS( @@ -717,9 +758,8 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { transformed_input_grad_data + i * group_offset_in)); }, workspace_size); -#endif } - +#endif if (!is_sys_pad) { std::vector starts(transformed_input_channel.dims().size(), 0); std::vector axes(transformed_input_channel.dims().size(), 0); @@ -751,23 +791,20 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { ScalingParamType beta_filter = 0.0f; // ------------------- cudnn conv backward filter --------------------- if (filter_grad) { - // Because beta is zero, it is unnecessary to reset filter_grad. - for (int i = 0; i < groups; i++) { +// Because beta is zero, it is unnecessary to reset filter_grad. 
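+      // NOTE: on the MIOPEN branch below the per-group loop is dropped because
+      // the group count is already set on the convolution descriptor (cdesc),
+      // so a single miopenConvolutionBackwardWeights call covers all groups;
+      // the cuDNN branch still loops over groups with explicit buffer offsets.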
#ifdef PADDLE_WITH_HIP - workspace_handle.RunFunc( - [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenConvolutionBackwardWeights( - handle, &alpha, args2.odesc.desc(), - output_grad_data + i * group_offset_out, - args2.idesc.desc(), input_data + i * group_offset_in, - args2.cdesc.desc(), filter_algo, &beta, - args2.wdesc.desc(), - filter_grad_data + i * group_offset_filter, - cudnn_workspace_ptr, workspace_size)); - }, - workspace_size); + workspace_handle.RunFunc( + [&](void* cudnn_workspace_ptr) { + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::miopenConvolutionBackwardWeights( + handle, &alpha, args2.odesc.desc(), output_grad_data, + args2.idesc.desc(), input_data, args2.cdesc.desc(), + filter_algo, &beta, args2.wdesc.desc(), filter_grad_data, + cudnn_workspace_ptr, workspace_size)); + }, + workspace_size); #else + for (int i = 0; i < groups; i++) { workspace_handle.RunFunc( [&](void* cudnn_workspace_ptr) { PADDLE_ENFORCE_CUDA_SUCCESS( @@ -780,8 +817,8 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { filter_grad_data + i * group_offset_filter)); }, workspace_size); -#endif } +#endif if (compute_format == DataLayout::kNHWC) { TransToChannelFirst( @@ -1080,32 +1117,37 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel { #ifdef PADDLE_WITH_HIP using search1 = SearchAlgorithm; + workspace_size = search1::GetWorkspaceSize(args1); + fwd_algo1 = search1::Find(args1, exhaustive_search, false, + workspace_size, ctx); #else using search1 = SearchAlgorithm; -#endif fwd_algo1 = search1::Find(args1, exhaustive_search, false, ctx); workspace_size = search1::GetWorkspaceSize(args1, fwd_algo1); +#endif } if (ddW) { ddw = ddW->data(); args2.handle = handle; args2.idesc.set(transformed_X, iwo_group); - args2.wdesc.set(*ddW, layout, iwo_group); - args2.odesc.set(transformed_ddO_channel, iwo_group); args2.cdesc.set(dtype, padding_common, strides, dilations, platform::AllowTF32Cudnn(), c_group); #ifdef PADDLE_WITH_HIP using search2 = SearchAlgorithm; + workspace_size = + std::max(workspace_size, search2::GetWorkspaceSize(args2)); + fwd_algo2 = search2::Find(args2, exhaustive_search, false, + workspace_size, ctx); #else using search2 = SearchAlgorithm; -#endif fwd_algo2 = search2::Find(args2, exhaustive_search, false, ctx); workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2, fwd_algo2)); +#endif } } @@ -1114,21 +1156,23 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel { args3.handle = handle; args3.idesc.set(transformed_ddX, iwo_group); args3.wdesc.set(*dW, layout, iwo_group); - args3.odesc.set(transformed_dO_channel, iwo_group); - args3.cdesc.set(dtype, padding_common, strides, dilations, platform::AllowTF32Cudnn(), c_group); #ifdef PADDLE_WITH_HIP using search3 = SearchAlgorithm; + workspace_size = + std::max(workspace_size, search3::GetWorkspaceSize(args3)); + filter_algo = search3::Find(args3, exhaustive_search, deterministic, + workspace_size, ctx); #else using search3 = SearchAlgorithm; -#endif filter_algo = search3::Find(args3, exhaustive_search, deterministic, ctx); workspace_size = std::max(workspace_size, search3::GetWorkspaceSize(args3, filter_algo)); +#endif } if (ddW && dX) { @@ -1143,13 +1187,17 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel { #ifdef PADDLE_WITH_HIP using search4 = SearchAlgorithm; + workspace_size = + std::max(workspace_size, search4::GetWorkspaceSize(args4)); + data_algo = search4::Find(args4, exhaustive_search, deterministic, + workspace_size, 
ctx); #else using search4 = SearchAlgorithm; -#endif data_algo = search4::Find(args4, exhaustive_search, deterministic, ctx); workspace_size = std::max(workspace_size, search4::GetWorkspaceSize(args4, data_algo)); +#endif } int i_n, i_c, i_d, i_h, i_w; @@ -1176,21 +1224,19 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel { if (ddO) { if (ddX) { ddx = transformed_ddX.data(); - for (int i = 0; i < groups; i++) { #ifdef PADDLE_WITH_HIP - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenConvolutionForward( - handle, &alpha, args1.idesc.desc(), - ddx + i * group_offset_in, args1.wdesc.desc(), - w + i * group_offset_filter, args1.cdesc.desc(), - fwd_algo1, &beta, args1.odesc.desc(), - transformed_ddy_channel + i * group_offset_out, - workspace_ptr, workspace_size)); - }, - workspace_size); + wkspace_handle.RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::miopenConvolutionForward( + handle, &alpha, args1.idesc.desc(), ddx, + args1.wdesc.desc(), w, args1.cdesc.desc(), fwd_algo1, + &beta, args1.odesc.desc(), transformed_ddy_channel, + workspace_ptr, workspace_size)); + }, + workspace_size); #else + for (int i = 0; i < groups; i++) { wkspace_handle.RunFunc( [&](void* workspace_ptr) { PADDLE_ENFORCE_CUDA_SUCCESS( @@ -1203,26 +1249,24 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel { transformed_ddy_channel + i * group_offset_out)); }, workspace_size); -#endif } +#endif } if (ddW) { - for (int i = 0; i < groups; i++) { #ifdef PADDLE_WITH_HIP - // MIOPEN ONLY support beta to be 0.0f - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenConvolutionForward( - handle, &alpha, args2.idesc.desc(), - x + i * group_offset_in, args2.wdesc.desc(), - ddw + i * group_offset_filter, args2.cdesc.desc(), - fwd_algo2, &beta, args2.odesc.desc(), - transformed_ddy_channel + i * group_offset_out, - workspace_ptr, workspace_size)); - }, - workspace_size); + // MIOPEN ONLY support beta to be 0.0f + wkspace_handle.RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::miopenConvolutionForward( + handle, &alpha, args2.idesc.desc(), x, args2.wdesc.desc(), + ddw, args2.cdesc.desc(), fwd_algo2, &beta, + args2.odesc.desc(), transformed_ddy_channel, + workspace_ptr, workspace_size)); + }, + workspace_size); #else + for (int i = 0; i < groups; i++) { wkspace_handle.RunFunc( [&](void* workspace_ptr) { PADDLE_ENFORCE_CUDA_SUCCESS( @@ -1235,8 +1279,8 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel { transformed_ddy_channel + i * group_offset_out)); }, workspace_size); -#endif } +#endif } if (channel_last) { TransToChannelLast( @@ -1246,21 +1290,19 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel { T* transformed_dy_channel = transformed_dO_channel.data(); if (dW && ddX) { ddx = transformed_ddX.data(); - for (int i = 0; i < groups; i++) { #ifdef PADDLE_WITH_HIP - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenConvolutionBackwardWeights( - handle, &alpha, args3.odesc.desc(), - transformed_dy_channel + i * group_offset_out, - args3.idesc.desc(), ddx + i * group_offset_in, - args3.cdesc.desc(), filter_algo, &beta, - args3.wdesc.desc(), dw + i * group_offset_filter, - workspace_ptr, workspace_size)); - }, - workspace_size); + wkspace_handle.RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_CUDA_SUCCESS( + 
platform::dynload::miopenConvolutionBackwardWeights( + handle, &alpha, args3.odesc.desc(), transformed_dy_channel, + args3.idesc.desc(), ddx, args3.cdesc.desc(), filter_algo, + &beta, args3.wdesc.desc(), dw, workspace_ptr, + workspace_size)); + }, + workspace_size); #else + for (int i = 0; i < groups; i++) { wkspace_handle.RunFunc( [&](void* workspace_ptr) { PADDLE_ENFORCE_CUDA_SUCCESS( @@ -1273,27 +1315,25 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel { dw + i * group_offset_filter)); }, workspace_size); -#endif } +#endif } if (dX && ddW) { ddw = ddW->data(); - for (int i = 0; i < groups; i++) { #ifdef PADDLE_WITH_HIP - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenConvolutionBackwardData( - handle, &alpha, args4.odesc.desc(), - transformed_dy_channel + i * group_offset_out, - args4.wdesc.desc(), ddw + i * group_offset_filter, - args4.cdesc.desc(), data_algo, &beta, args4.idesc.desc(), - transformed_dx + i * group_offset_in, workspace_ptr, - workspace_size)); - }, - workspace_size); + wkspace_handle.RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::miopenConvolutionBackwardData( + handle, &alpha, args4.odesc.desc(), transformed_dy_channel, + args4.wdesc.desc(), ddw, args4.cdesc.desc(), data_algo, + &beta, args4.idesc.desc(), transformed_dx, workspace_ptr, + workspace_size)); + }, + workspace_size); #else + for (int i = 0; i < groups; i++) { wkspace_handle.RunFunc( [&](void* workspace_ptr) { PADDLE_ENFORCE_CUDA_SUCCESS( @@ -1306,8 +1346,8 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel { transformed_dx + i * group_offset_in)); }, workspace_size); -#endif } +#endif if (!is_sys_pad) { // reverse padded input @@ -1350,7 +1390,14 @@ REGISTER_OP_KERNEL( conv2d_grad_grad, CUDNN, plat::CUDAPlace, paddle::operators::CUDNNConvDoubleGradOpKernel, paddle::operators::CUDNNConvDoubleGradOpKernel); - +// ROCM has limit thread in depthwise_conv.cu and willl result in accuracy issue +// Use depthwise_conv2d in MIOPEN to resolve this issue +REGISTER_OP_KERNEL(depthwise_conv2d, CUDNN, plat::CUDAPlace, + paddle::operators::CUDNNConvOpKernel, + paddle::operators::CUDNNConvOpKernel); +REGISTER_OP_KERNEL(depthwise_conv2d_grad, CUDNN, plat::CUDAPlace, + paddle::operators::CUDNNConvGradOpKernel, + paddle::operators::CUDNNConvGradOpKernel); REGISTER_OP_CUDA_KERNEL( depthwise_conv2d_grad_grad, paddle::operators::CUDNNConvDoubleGradOpKernel, diff --git a/paddle/fluid/operators/conv_cudnn_op_cache.h b/paddle/fluid/operators/conv_cudnn_op_cache.h index ddddb7f8641ba1276dff4b01a1caaa6176f97714..23a471cfa006746c5762fa7169248cbae60a7899 100644 --- a/paddle/fluid/operators/conv_cudnn_op_cache.h +++ b/paddle/fluid/operators/conv_cudnn_op_cache.h @@ -40,11 +40,6 @@ static constexpr size_t kNUM_CUDNN_BWD_FILTER_ALGS = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT; static constexpr size_t kNUM_CUDNN_BWD_DATA_ALGS = CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT; -#else -// cuDNN v5 has no CUDNN_CONVOLUTION_FWD_ALGO_COUNT etc. 
-static constexpr size_t kNUM_CUDNN_FWD_ALGS = 7; -static constexpr size_t kNUM_CUDNN_BWD_FILTER_ALGS = 4; -static constexpr size_t kNUM_CUDNN_BWD_DATA_ALGS = 5; #endif } // namespace operators diff --git a/paddle/fluid/operators/conv_miopen_helper.h b/paddle/fluid/operators/conv_miopen_helper.h index 44ead95a355a25e148fb2d6fcd10a552b92b7f86..befe09c8e6beb3d911521e4ff78f3427a3b0dd78 100644 --- a/paddle/fluid/operators/conv_miopen_helper.h +++ b/paddle/fluid/operators/conv_miopen_helper.h @@ -127,57 +127,32 @@ struct SearchAlgorithm { template static algo_t Find(const ConvArgs& args, bool exhaustive_search, - bool deterministic, + bool deterministic, size_t workspace_size, const framework::ExecutionContext& ctx) { - auto dtype = platform::CudnnDataType::type; - bool has_got_workspace_size = true; - size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024; - size_t workspace_size = 0; algo_t algo; auto& dev_ctx = ctx.template device_context(); auto workspace_handle = dev_ctx.cudnn_workspace_handle(); - auto& temp = ctx.cuda_device_context(); - AlgorithmsCache& algo_cache = - *(framework::ConvSearchCache::Instance().GetForward()); - - auto x_dims = framework::vectorize(args.x->dims()); - auto w_dims = framework::vectorize(args.w->dims()); - - VLOG(10) << "miopenConvolutionFwdAlgoPerf_t:" - << ", x_dims:" << x_dims << ", w_dims:" << w_dims << ", args.s" - << args.s << ", args.p" << args.p << ", args.d" << args.d; - - algo = algo_cache.GetAlgorithm( - x_dims, w_dims, args.s, args.p, args.d, 0, - static_cast(args.cudnn_dtype), [&]() { - int returned_algo_count; - std::array perf_stat; - - auto cudnn_find_func = [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenFindConvolutionForwardAlgorithm( - args.handle, args.idesc.desc(), args.x->data(), - args.wdesc.desc(), args.w->data(), args.cdesc.desc(), - args.odesc.desc(), const_cast(args.o->data()), - kNUM_CUDNN_FWD_ALGS, &returned_algo_count, perf_stat.data(), - cudnn_workspace_ptr, workspace_size_limit, false)); - }; - workspace_handle.RunFuncSync(cudnn_find_func, workspace_size_limit); - - VLOG(3) << "FwdAlgo Perf result: (algo: stat, time, memory)"; - for (int i = 0; i < returned_algo_count; ++i) { - const auto& stat = perf_stat[i]; - VLOG(3) << stat.fwd_algo; - } - return perf_stat[0].fwd_algo; - }); + int find_count; + miopenConvAlgoPerf_t find_result; + auto cudnn_find_func = [&](void* cudnn_workspace_ptr) { + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::miopenFindConvolutionForwardAlgorithm( + args.handle, args.idesc.desc(), args.x->data(), + args.wdesc.desc(), args.w->data(), args.cdesc.desc(), + args.odesc.desc(), const_cast(args.o->data()), + kNUM_CUDNN_FWD_ALGS, &find_count, &find_result, + cudnn_workspace_ptr, workspace_size, false)); + }; + + workspace_handle.RunFuncSync(cudnn_find_func, workspace_size); + algo = find_result.fwd_algo; VLOG(3) << "choose algo " << algo; return algo; } - static size_t GetWorkspaceSize(const ConvArgs& args, algo_t algo) { + static size_t GetWorkspaceSize(const ConvArgs& args) { size_t workspace_size = 0; PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::miopenConvolutionForwardGetWorkSpaceSize( @@ -194,58 +169,32 @@ struct SearchAlgorithm { template static algo_t Find(const ConvArgs& args, bool exhaustive_search, - bool deterministic, + bool deterministic, size_t workspace_size, const framework::ExecutionContext& ctx) { - auto dtype = platform::CudnnDataType::type; - size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024; - 
size_t workspace_size = 0; - bool has_got_workspace_size = true; algo_t algo; auto& dev_ctx = ctx.template device_context(); auto workspace_handle = dev_ctx.cudnn_workspace_handle(); - AlgorithmsCache& algo_cache = - *(framework::ConvSearchCache::Instance().GetBackwardData()); - - auto x_dims = framework::vectorize(args.x->dims()); - auto w_dims = framework::vectorize(args.w->dims()); - - VLOG(10) << "miopenConvolutionFwdAlgoPerf_t" - << ", x_dims:" << x_dims << ", w_dims:" << w_dims << ", args.s" - << args.s << ", args.p" << args.p << ", args.d" << args.d; - - algo = algo_cache.GetAlgorithm( - x_dims, w_dims, args.s, args.p, args.d, 0, - static_cast(args.cudnn_dtype), [&]() { - int returned_algo_count; - std::array perf_stat; - - auto cudnn_find_func = [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenFindConvolutionBackwardDataAlgorithm( - args.handle, args.odesc.desc(), args.o->data(), - args.wdesc.desc(), args.w->data(), args.cdesc.desc(), - args.idesc.desc(), const_cast(args.x->data()), - kNUM_CUDNN_BWD_DATA_ALGS, &returned_algo_count, - perf_stat.data(), cudnn_workspace_ptr, workspace_size_limit, - false)); - }; - workspace_handle.RunFuncSync(cudnn_find_func, workspace_size_limit); - - VLOG(3) << "BwdDataAlgo Perf result: (algo: stat, time, memory)"; - for (int i = 0; i < returned_algo_count; ++i) { - const auto& stat = perf_stat[i]; - VLOG(3) << stat.bwd_data_algo; - } - - return perf_stat[0].bwd_data_algo; - }); + int find_count; + miopenConvAlgoPerf_t find_result; + auto cudnn_find_func = [&](void* cudnn_workspace_ptr) { + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::miopenFindConvolutionBackwardDataAlgorithm( + args.handle, args.odesc.desc(), args.o->data(), + args.wdesc.desc(), args.w->data(), args.cdesc.desc(), + args.idesc.desc(), const_cast(args.x->data()), + kNUM_CUDNN_BWD_DATA_ALGS, &find_count, &find_result, + cudnn_workspace_ptr, workspace_size, false)); + }; + + workspace_handle.RunFuncSync(cudnn_find_func, workspace_size); + algo = find_result.bwd_data_algo; VLOG(3) << "choose algo " << algo; return algo; } - static size_t GetWorkspaceSize(const ConvArgs& args, algo_t algo) { + static size_t GetWorkspaceSize(const ConvArgs& args) { size_t workspace_size = 0; PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::miopenConvolutionBackwardDataGetWorkSpaceSize( @@ -262,56 +211,32 @@ struct SearchAlgorithm { template static algo_t Find(const ConvArgs& args, bool exhaustive_search, - bool deterministic, + bool deterministic, size_t workspace_size, const framework::ExecutionContext& ctx) { - auto dtype = platform::CudnnDataType::type; - size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024; - size_t workspace_size = 0; - bool has_got_workspace_size = true; algo_t algo; auto& dev_ctx = ctx.template device_context(); auto workspace_handle = dev_ctx.cudnn_workspace_handle(); - AlgorithmsCache& algo_cache = - *(framework::ConvSearchCache::Instance().GetBackwardFilter()); - - auto x_dims = framework::vectorize(args.x->dims()); - auto w_dims = framework::vectorize(args.w->dims()); - - VLOG(10) << "miopenConvolutionFwdAlgoPerf_t:" - << ", x_dims:" << x_dims << ", w_dims:" << w_dims << ", args.s" - << args.s << ", args.p" << args.p << ", args.d" << args.d; - - algo = algo_cache.GetAlgorithm( - x_dims, w_dims, args.s, args.p, args.d, 0, - static_cast(args.cudnn_dtype), [&]() { - int returned_algo_count; - std::array perf_stat; - auto cudnn_find_func = [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( - 
platform::dynload:: - miopenFindConvolutionBackwardWeightsAlgorithm( - args.handle, args.odesc.desc(), args.o->data(), - args.idesc.desc(), args.x->data(), args.cdesc.desc(), - args.wdesc.desc(), const_cast(args.w->data()), - kNUM_CUDNN_BWD_FILTER_ALGS, &returned_algo_count, - perf_stat.data(), cudnn_workspace_ptr, - workspace_size_limit, false)); - }; - workspace_handle.RunFuncSync(cudnn_find_func, workspace_size_limit); - - VLOG(3) << "BwdFilterAlgo Perf result: (algo: stat, time, memory)"; - for (int i = 0; i < returned_algo_count; ++i) { - const auto& stat = perf_stat[i]; - VLOG(3) << stat.bwd_weights_algo; - } - return perf_stat[0].bwd_weights_algo; - }); + + int find_count; + miopenConvAlgoPerf_t find_result; + auto cudnn_find_func = [&](void* cudnn_workspace_ptr) { + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::miopenFindConvolutionBackwardWeightsAlgorithm( + args.handle, args.odesc.desc(), args.o->data(), + args.idesc.desc(), args.x->data(), args.cdesc.desc(), + args.wdesc.desc(), const_cast(args.w->data()), + kNUM_CUDNN_BWD_FILTER_ALGS, &find_count, &find_result, + cudnn_workspace_ptr, workspace_size, false)); + }; + + workspace_handle.RunFuncSync(cudnn_find_func, workspace_size); + algo = find_result.bwd_weights_algo; VLOG(3) << "choose algo " << algo; return algo; } - static size_t GetWorkspaceSize(const ConvArgs& args, algo_t algo) { + static size_t GetWorkspaceSize(const ConvArgs& args) { size_t workspace_size = 0; PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::miopenConvolutionBackwardWeightsGetWorkSpaceSize( diff --git a/paddle/fluid/operators/conv_op.h b/paddle/fluid/operators/conv_op.h index 364e3ab8d26c3f35f41f319b3d31b63964b93abe..94d1f707b74c2eae17d02771ad7d548e8b908dd9 100644 --- a/paddle/fluid/operators/conv_op.h +++ b/paddle/fluid/operators/conv_op.h @@ -903,29 +903,19 @@ class DepthwiseConvKernel : public framework::OpKernel { "and input channel number is %d", output->dims()[1], input->dims()[1])); } - // transform tensor - Tensor transformed_input(input->type()); - Tensor transformed_output(output->type()); - - if (channel_last) { - ResizeToChannelFirst(context, input, - &transformed_input); - TransToChannelFirst(context, input, &transformed_input); - - ResizeToChannelFirst(context, output, - &transformed_output); - - } else { - transformed_input = *input; - transformed_output = *output; - } // update padding and dilation - auto in_dims = transformed_input.dims(); + auto in_dims = input->dims(); auto filter_dims = filter.dims(); framework::DDim in_data_dims; - in_data_dims = framework::slice_ddim(in_dims, 2, in_dims.size()); + const framework::DataLayout data_layout = + framework::StringToDataLayout(data_format); + if (data_layout != framework::DataLayout::kNHWC) { + in_data_dims = framework::slice_ddim(in_dims, 2, in_dims.size()); + } else { + in_data_dims = framework::slice_ddim(in_dims, 1, in_dims.size() - 1); + } framework::DDim filter_data_dims = framework::slice_ddim(filter_dims, 2, filter_dims.size()); @@ -944,16 +934,12 @@ class DepthwiseConvKernel : public framework::OpKernel { if (fuse_relu) { math::DepthwiseConvFunctor depthwiseConv; - depthwiseConv(dev_ctx, transformed_input, filter, strides, paddings, - dilations, &transformed_output); + depthwiseConv(dev_ctx, *input, filter, strides, paddings, dilations, + output, data_layout); } else { math::DepthwiseConvFunctor depthwiseConv; - depthwiseConv(dev_ctx, transformed_input, filter, strides, paddings, - dilations, &transformed_output); - } - if (channel_last) { - TransToChannelLast(context, 
&transformed_output, - output); + depthwiseConv(dev_ctx, *input, filter, strides, paddings, dilations, + output, data_layout); } } }; @@ -981,33 +967,18 @@ class DepthwiseConvGradKernel : public framework::OpKernel { context.Attr("padding_algorithm"); const std::string data_format = context.Attr("data_format"); - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - - // transform Tensor - Tensor transformed_input(input->type()); - Tensor transformed_output_grad(output_grad->type()); - - if (channel_last) { - ResizeToChannelFirst(context, input, - &transformed_input); - TransToChannelFirst(context, input, &transformed_input); - - ResizeToChannelFirst(context, output_grad, - &transformed_output_grad); - TransToChannelFirst(context, output_grad, - &transformed_output_grad); - - } else { - transformed_input = *input; - transformed_output_grad = *output_grad; - } - // update padding and dilation - auto in_dims = transformed_input.dims(); + auto in_dims = input->dims(); auto filter_dims = filter.dims(); framework::DDim in_data_dims; - in_data_dims = framework::slice_ddim(in_dims, 2, in_dims.size()); + const framework::DataLayout data_layout = + framework::StringToDataLayout(data_format); + if (data_layout != framework::DataLayout::kNHWC) { + in_data_dims = framework::slice_ddim(in_dims, 2, in_dims.size()); + } else { + in_data_dims = framework::slice_ddim(in_dims, 1, in_dims.size() - 1); + } framework::DDim filter_data_dims = framework::slice_ddim(filter_dims, 2, filter_dims.size()); std::vector ksize = framework::vectorize(filter_data_dims); @@ -1025,33 +996,18 @@ class DepthwiseConvGradKernel : public framework::OpKernel { if (input_grad) { input_grad->mutable_data(context.GetPlace()); - Tensor transformed_input_grad(input_grad->type()); - if (channel_last) { - ResizeToChannelFirst(context, input_grad, - &transformed_input_grad); - - } else { - transformed_input_grad = *input_grad; - } - - set_zero(dev_ctx, &transformed_input_grad, static_cast(0)); + set_zero(dev_ctx, input_grad, static_cast(0)); if (fuse_relu) { math::DepthwiseConvInputGradFunctor depthwiseConvInputGrad; - depthwiseConvInputGrad(dev_ctx, transformed_input, filter, - transformed_output_grad, strides, paddings, - dilations, &transformed_input_grad); + depthwiseConvInputGrad(dev_ctx, *input, filter, *output_grad, strides, + paddings, dilations, input_grad, data_layout); } else { math::DepthwiseConvInputGradFunctor depthwiseConvInputGrad; - depthwiseConvInputGrad(dev_ctx, transformed_input, filter, - transformed_output_grad, strides, paddings, - dilations, &transformed_input_grad); - } - if (channel_last) { - TransToChannelLast(context, &transformed_input_grad, - input_grad); + depthwiseConvInputGrad(dev_ctx, *input, filter, *output_grad, strides, + paddings, dilations, input_grad, data_layout); } } @@ -1061,15 +1017,13 @@ class DepthwiseConvGradKernel : public framework::OpKernel { if (fuse_relu) { math::DepthwiseConvFilterGradFunctor depthwiseConvFilterGrad; - depthwiseConvFilterGrad(dev_ctx, transformed_input, - transformed_output_grad, strides, paddings, - dilations, filter_grad); + depthwiseConvFilterGrad(dev_ctx, *input, *output_grad, strides, + paddings, dilations, filter_grad, data_layout); } else { math::DepthwiseConvFilterGradFunctor depthwiseConvFilterGrad; - depthwiseConvFilterGrad(dev_ctx, transformed_input, - transformed_output_grad, strides, paddings, - dilations, filter_grad); + depthwiseConvFilterGrad(dev_ctx, *input, *output_grad, strides, + paddings, dilations, filter_grad, 
data_layout); } } } diff --git a/paddle/fluid/operators/conv_transpose_cudnn_op.cu b/paddle/fluid/operators/conv_transpose_cudnn_op.cu index 376cefe50258bf788404ddf63854f2ac84f327e5..c4cd5854c0f78ae970ee953f9cb46f5c1b840630 100644 --- a/paddle/fluid/operators/conv_transpose_cudnn_op.cu +++ b/paddle/fluid/operators/conv_transpose_cudnn_op.cu @@ -202,7 +202,7 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel { int iwo_groups = groups; int c_groups = 1; -#if CUDNN_VERSION_MIN(7, 0, 1) +#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) iwo_groups = 1; c_groups = groups; groups = 1; @@ -244,13 +244,14 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel { #ifdef PADDLE_WITH_HIP using search = SearchAlgorithm; + workspace_size = std::max(workspace_size, search::GetWorkspaceSize(args)); + algo = search::Find(args, false, deterministic, workspace_size, ctx); #else using search = SearchAlgorithm; -#endif - algo = search::Find(args, false, deterministic, ctx); workspace_size = std::max(workspace_size, search::GetWorkspaceSize(args, algo)); +#endif // ------------------- cudnn conv transpose forward --------------------- int input_offset = @@ -451,7 +452,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { int iwo_groups = groups; int c_groups = 1; -#if CUDNN_VERSION_MIN(7, 0, 1) +#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) iwo_groups = 1; c_groups = groups; groups = 1; @@ -489,10 +490,6 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { bool deterministic = FLAGS_cudnn_deterministic; T* input_grad_data = nullptr; T* filter_grad_data = nullptr; - if (input_grad) - input_grad_data = input_grad->mutable_data(ctx.GetPlace()); - if (filter_grad) - filter_grad_data = filter_grad->mutable_data(ctx.GetPlace()); if (input_grad) { input_grad_data = input_grad->mutable_data(ctx.GetPlace()); @@ -504,12 +501,16 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { platform::AllowTF32Cudnn(), c_groups); #ifdef PADDLE_WITH_HIP using search1 = SearchAlgorithm; + workspace_size = + std::max(workspace_size, search1::GetWorkspaceSize(args1)); + data_algo = + search1::Find(args1, false, deterministic, workspace_size, ctx); #else using search1 = SearchAlgorithm; -#endif data_algo = search1::Find(args1, false, deterministic, ctx); workspace_size = std::max(workspace_size, search1::GetWorkspaceSize(args1, data_algo)); +#endif } if (filter_grad) { @@ -522,12 +523,16 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { platform::AllowTF32Cudnn(), c_groups); #ifdef PADDLE_WITH_HIP using search2 = SearchAlgorithm; + workspace_size = + std::max(workspace_size, search2::GetWorkspaceSize(args2)); + filter_algo = + search2::Find(args2, false, deterministic, workspace_size, ctx); #else using search2 = SearchAlgorithm; -#endif filter_algo = search2::Find(args2, false, deterministic, ctx); workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2, filter_algo)); +#endif } // ------------------- cudnn conv backward data --------------------- @@ -875,7 +880,7 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { int iwo_group = groups; int c_group = 1; -#if CUDNN_VERSION_MIN(7, 0, 1) +#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) iwo_group = 1; c_group = groups; groups = 1; @@ -939,14 +944,18 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { args1.idesc.set(transformed_ddO_channel, iwo_group); args1.wdesc.set(*W, layout, iwo_group); 
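+      // conv_transpose computes its "forward" result with the backward-data
+      // convolution routine, so idesc here describes the transposed-conv
+      // output side (ddO) and odesc below describes its input side (ddX) for
+      // the algorithm search.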
args1.odesc.set(transformed_ddX, iwo_group); - args1.cdesc.set(dtype, padding_common, strides, dilations, c_group); + args1.cdesc.set(dtype, padding_common, strides, dilations, + platform::AllowTF32Cudnn(), c_group); #ifdef PADDLE_WITH_HIP using search1 = SearchAlgorithm; + workspace_size = search1::GetWorkspaceSize(args1); + bwd_algo1 = + search1::Find(args1, false, deterministic, workspace_size, ctx); #else using search1 = SearchAlgorithm; -#endif bwd_algo1 = search1::Find(args1, false, deterministic, ctx); workspace_size = search1::GetWorkspaceSize(args1, bwd_algo1); +#endif } if (ddW) { @@ -955,15 +964,20 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { args2.idesc.set(transformed_ddO_channel, iwo_group); args2.wdesc.set(*ddW, layout, iwo_group); args2.odesc.set(transformed_X, iwo_group); - args2.cdesc.set(dtype, padding_common, strides, dilations, c_group); + args2.cdesc.set(dtype, padding_common, strides, dilations, + platform::AllowTF32Cudnn(), c_group); #ifdef PADDLE_WITH_HIP using search2 = SearchAlgorithm; + workspace_size = + std::max(workspace_size, search2::GetWorkspaceSize(args2)); + bwd_algo2 = + search2::Find(args2, false, deterministic, workspace_size, ctx); #else using search2 = SearchAlgorithm; -#endif bwd_algo2 = search2::Find(args2, false, deterministic, ctx); workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2, bwd_algo2)); +#endif } } @@ -975,15 +989,20 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { args3.odesc.set(transformed_ddX_channel, iwo_group); - args3.cdesc.set(dtype, padding_common, strides, dilations, c_group); + args3.cdesc.set(dtype, padding_common, strides, dilations, + platform::AllowTF32Cudnn(), c_group); #ifdef PADDLE_WITH_HIP using search3 = SearchAlgorithm; + workspace_size = + std::max(workspace_size, search3::GetWorkspaceSize(args3)); + filter_algo = + search3::Find(args3, false, deterministic, workspace_size, ctx); #else using search3 = SearchAlgorithm; -#endif filter_algo = search3::Find(args3, false, deterministic, ctx); workspace_size = std::max(workspace_size, search3::GetWorkspaceSize(args3, filter_algo)); +#endif } if (ddW && dX) { @@ -993,15 +1012,20 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { args4.idesc.set(transformed_dO, iwo_group); args4.wdesc.set(*ddW, layout, iwo_group); args4.odesc.set(transformed_dX_channel, iwo_group); - args4.cdesc.set(dtype, padding_common, strides, dilations, c_group); + args4.cdesc.set(dtype, padding_common, strides, dilations, + platform::AllowTF32Cudnn(), c_group); #ifdef PADDLE_WITH_HIP using search4 = SearchAlgorithm; + workspace_size = + std::max(workspace_size, search4::GetWorkspaceSize(args4)); + data_algo = + search4::Find(args4, false, deterministic, workspace_size, ctx); #else using search4 = SearchAlgorithm; -#endif data_algo = search4::Find(args4, false, deterministic, ctx); workspace_size = std::max(workspace_size, search4::GetWorkspaceSize(args4, data_algo)); +#endif } int i_n, i_c, i_d, i_h, i_w; @@ -1059,6 +1083,10 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { if (ddW) { for (int i = 0; i < groups; i++) { #ifdef PADDLE_WITH_HIP + // MIOPEN ONLY support beta to be 0.0f + Tensor conv_x_ddw(dO->type()); + conv_x_ddw.Resize(transformed_ddO_channel.dims()); + T* conv_x_ddw_data = conv_x_ddw.mutable_data(ctx.GetPlace()); wkspace_handle.RunFunc( [&](void* workspace_ptr) { PADDLE_ENFORCE_CUDA_SUCCESS( @@ -1066,11 +1094,17 @@ class 
CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { handle, &alpha, args2.odesc.desc(), x + i * group_offset_in, args2.wdesc.desc(), ddw + i * group_offset_filter, args2.cdesc.desc(), - bwd_algo2, &alpha, args2.idesc.desc(), - transformed_ddy_channel + i * group_offset_out, - workspace_ptr, workspace_size)); + bwd_algo2, &beta, args2.idesc.desc(), + conv_x_ddw_data + i * group_offset_out, workspace_ptr, + workspace_size)); }, workspace_size); + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenOpTensor( + handle, miopenTensorOpAdd, &alpha, args2.idesc.desc(), + transformed_ddy_channel + i * group_offset_out, &alpha, + args2.idesc.desc(), conv_x_ddw_data + i * group_offset_out, &beta, + args2.idesc.desc(), + transformed_ddy_channel + i * group_offset_out)); #else // PADDLE_WITH_HIP wkspace_handle.RunFunc( [&](void* workspace_ptr) { diff --git a/paddle/fluid/operators/conv_transpose_op.h b/paddle/fluid/operators/conv_transpose_op.h index 651719f1052806ad356f2bc8fd4c2f3a0abe210b..ecf5b6d774a2605c06bbeb2514c981b46e7f6a0d 100644 --- a/paddle/fluid/operators/conv_transpose_op.h +++ b/paddle/fluid/operators/conv_transpose_op.h @@ -682,9 +682,9 @@ class DepthwiseConvTransposeGradKernel : public framework::OpKernel { if (input_grad) { math::DepthwiseConvFunctor depthwiseConv; depthwiseConv( - dev_ctx, *output_grad, filter, strides, paddings, + dev_ctx, *output_grad, filter, strides, std::vector{paddings[0], paddings[2], paddings[1], paddings[3]}, - input_grad, data_layout); + dilations, input_grad, data_layout); } if (filter_grad) { diff --git a/paddle/fluid/operators/copy_cross_scope_op.cc b/paddle/fluid/operators/copy_cross_scope_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..721354954c70355e18d330ea458101a23d9cb401 --- /dev/null +++ b/paddle/fluid/operators/copy_cross_scope_op.cc @@ -0,0 +1,151 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
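+
+// copy_cross_scope_op copies the value of a variable from one pipeline micro
+// scope (selected by the Id input) into the next micro scope, or back into
+// the main scope when to_main_scope is set. Illustrative driver code, with
+// hypothetical variable names, mirroring how ops are created elsewhere in
+// this patch:
+//
+//   paddle::framework::AttributeMap attrs;
+//   attrs["to_main_scope"] = false;
+//   attrs["num_micro_batches"] = 3;
+//   auto op = paddle::framework::OpRegistry::CreateOp(
+//       "copy_cross_scope", {{"X", {"x"}}, {"Id", {"step_id"}}}, {}, attrs);
+//   op->Run(scope, place);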
+ +#include + +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/var_type_traits.h" + +namespace paddle { +namespace framework { +class OpDesc; +template +class EmptyGradOpMaker; +} // namespace framework +namespace imperative { +class OpBase; +} // namespace imperative +} // namespace paddle + +using LoDTensor = paddle::framework::LoDTensor; +using Tensor = paddle::framework::Tensor; + +namespace paddle { +namespace operators { + +class CopyCrossScopeOp : public framework::OperatorBase { + public: + CopyCrossScopeOp(const std::string& type, + const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void InferShape(framework::InferShapeContext* ctx) const {} + + private: + void RunImpl(const framework::Scope& scope, + const platform::Place& dev_place) const override { + int num_micro_scopes = scope.kids().size(); + int num_micro_batches = Attr("num_micro_batches"); + bool ToM = Attr("to_main_scope"); + PADDLE_ENFORCE_EQ(num_micro_scopes, num_micro_batches, + platform::errors::InvalidArgument( + "For pipeline, number of micro scopes (%d) should " + "be equal to number of micro batches (%d).", + num_micro_scopes, num_micro_batches)); + const std::string& id_name = Input("Id"); + auto* id_var = scope.FindVar(id_name); + PADDLE_ENFORCE_NOT_NULL( + id_var, + platform::errors::NotFound("No variable with name %s found.", id_name)); + auto id_tensor = id_var->GetMutable(); + auto it = scope.kids().begin(); + framework::Tensor cpu_id_tensor; + TensorCopySync(*id_tensor, platform::CPUPlace(), &cpu_id_tensor); + auto id_value = cpu_id_tensor.data(); + for (auto i = 0; i < *id_value; i++) { + it++; + } + if (it == scope.kids().end()) { + if (ToM) { + auto dst_scope = *it; + const std::string& x_name = Input("X"); + auto* dst_var = dst_scope->FindVar(x_name); + PADDLE_ENFORCE_NOT_NULL( + dst_var, + platform::errors::NotFound( + "No variable with name %s found in source scope.", x_name)); + auto* main_var = scope.FindVar(x_name); + PADDLE_ENFORCE_NOT_NULL( + main_var, + platform::errors::NotFound( + "No variable with name %s found in destination scope.", + x_name)); + auto dst_tensor = dst_var->GetMutable(); + auto main_tensor = main_var->GetMutable(); + TensorCopySync(*dst_tensor, main_tensor->place(), main_tensor); + } + return; + } + auto source_scope = *it; + it++; + auto dst_scope = *it; + const std::string& x_name = Input("X"); + auto* source_var = source_scope->FindVar(x_name); + PADDLE_ENFORCE_NOT_NULL( + source_var, + platform::errors::NotFound( + "No variable with name %s found in source scope.", x_name)); + auto* dst_var = dst_scope->FindVar(x_name); + PADDLE_ENFORCE_NOT_NULL( + dst_var, + platform::errors::NotFound( + "No variable with name %s found in destination scope.", x_name)); + auto src_tensor = source_var->GetMutable(); + auto dst_tensor = dst_var->GetMutable(); + TensorCopySync(*src_tensor, dst_tensor->place(), dst_tensor); + + if (ToM) { + auto* main_var = scope.FindVar(x_name); + PADDLE_ENFORCE_NOT_NULL( + main_var, + platform::errors::NotFound( + "No variable with name %s found in destination scope.", x_name)); + auto main_tensor = main_var->GetMutable(); + TensorCopySync(*dst_tensor, main_tensor->place(), main_tensor); + } + } +}; + +class CopyCrossScopeOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() 
override { + AddInput("X", + "(Tensor), The first input tensor of copy_cross_scope op, which " + "is to be copied across micro scopes."); + AddInput("Id", + "(Tensor), The second input tensor of copy_cross_scope op, which " + "is the id of the current micro scope."); + AddAttr("to_main_scope", "Whether to copy the variable back to the main scope.") + .SetDefault(false); + AddAttr("num_micro_batches", "Number of micro batches for pipeline."); + AddComment(R"DOC( + This op is used by the pipeline to copy tensors across micro batch scopes. + It copies the variable value from the micro scope of the given Id to the micro scope at position Id + 1. + If the value needs to be copied back to the main scope, use the to_main_scope option to copy the variable value of + the current micro scope to the main scope. + )DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_WITHOUT_GRADIENT(copy_cross_scope, ops::CopyCrossScopeOp, + ops::CopyCrossScopeOpMaker); diff --git a/paddle/fluid/operators/copy_cross_scope_test.cc b/paddle/fluid/operators/copy_cross_scope_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..e175b235f9c1816df8d0b4bb4cc0740778288cef --- /dev/null +++ b/paddle/fluid/operators/copy_cross_scope_test.cc @@ -0,0 +1,154 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/copy_cross_scope_op.cc" +#include "paddle/fluid/string/printf.h" + +#define Conn(x, y) x##y + +namespace f = paddle::framework; +namespace p = paddle::platform; + +USE_NO_KERNEL_OP(copy_cross_scope); + +template +void Compare1(f::Scope* scope, const p::DeviceContext& ctx, + std::string op_type) { + // init + auto var_x = scope->Var("tmp"); + auto x = var_x->GetMutable(); + std::vector main_x = {1.0}; + TensorFromVector(main_x, ctx, x); + + auto var_id = scope->Var("Id"); + auto id = var_id->GetMutable(); + std::vector main_id = {1}; + TensorFromVector(main_id, ctx, id); + for (int i = 0; i < 3; i++) { + auto& child_scope = scope->NewScope(); + auto child_var = child_scope.Var("tmp"); + auto tensor_x = child_var->GetMutable(); + std::vector init_x = {static_cast(i)}; + TensorFromVector(init_x, ctx, tensor_x); + } + + ctx.Wait(); + + // run + f::AttributeMap attrs = {{"to_main_scope", false}, {"num_micro_batches", 3}}; + std::map> output; + auto op = f::OpRegistry::CreateOp(op_type, {{"X", {"tmp"}}, {"Id", {"Id"}}}, + output, attrs); + + auto place = ctx.GetPlace(); + op->Run(*scope, place); + ctx.Wait(); + + std::list::const_iterator iter = scope->kids().begin(); + iter++; + iter++; + + auto* kid_scope = *iter; + auto* dst_var = kid_scope->FindVar("tmp"); + auto* tensor_out = dst_var->GetMutable(); + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + + int expected = 1; + EXPECT_EQ(static_cast(out_vec[0]), expected); +} + +template +void Compare2(f::Scope* scope, const p::DeviceContext& ctx, + std::string op_type) { + // init + auto var_x = scope->Var("tmp"); + auto x = var_x->GetMutable(); + std::vector main_x = {1.0}; + TensorFromVector(main_x, ctx, x); + + auto var_id = scope->Var("Id"); + auto id = var_id->GetMutable(); + std::vector main_id = {0}; + TensorFromVector(main_id, ctx, id); + for (int i = 0; i < 3; i++) { + auto& child_scope = scope->NewScope(); + auto child_var = child_scope.Var("tmp"); + auto tensor_x = child_var->GetMutable(); + std::vector init_x = {static_cast(i)}; + TensorFromVector(init_x, ctx, tensor_x); + } + + ctx.Wait(); + + // run + f::AttributeMap attrs = {{"to_main_scope", true}, {"num_micro_batches", 3}}; + std::map> output; + auto op = f::OpRegistry::CreateOp(op_type, {{"X", {"tmp"}}, {"Id", {"Id"}}}, + output, attrs); + + auto place = ctx.GetPlace(); + op->Run(*scope, place); + ctx.Wait(); + + auto* dst_var = scope->FindVar("tmp"); + auto* tensor_out = dst_var->GetMutable(); + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + + int expected = 0; + EXPECT_EQ(static_cast(out_vec[0]), expected); +} + +#ifdef PADDLE_WITH_CUDA +TEST(copy_cross_scope, CUDA_fp32) { + f::Scope scope; + p::CUDADeviceContext ctx(p::CUDAPlace(0)); + Compare1(&scope, ctx, "copy_cross_scope"); +} + +TEST(copy_cross_scope_to_main_scope, CUDA_fp32) { + f::Scope scope; + p::CUDADeviceContext ctx(p::CUDAPlace(0)); + Compare2(&scope, ctx, "copy_cross_scope"); +} +#elif PADDLE_WITH_ASCEND_CL +TEST(copy_cross_scope, NPU_fp32) { + f::Scope scope; + p::NPUDeviceContext ctx(p::NPUPlace(0)); + Compare1(&scope, ctx, "copy_cross_scope"); +} + +TEST(copy_cross_scope_to_main_scope, NPU_fp32) { + f::Scope scope; + p::NPUDeviceContext ctx(p::NPUPlace(0)); + 
Compare2(&scope, ctx, "copy_cross_scope"); +} +#endif diff --git a/paddle/fluid/operators/correlation_op.cu b/paddle/fluid/operators/correlation_op.cu index a51fce8132418b09c8f2db397fc83c8c69a8a429..9b08f875bb6e6d2e7f8dd1a8cdb142198438e34e 100644 --- a/paddle/fluid/operators/correlation_op.cu +++ b/paddle/fluid/operators/correlation_op.cu @@ -12,17 +12,22 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#ifndef PADDLE_WITH_HIP -// HIP not supported yet - #include #include #include "paddle/fluid/framework/op_registry.h" +#ifdef __HIPCC__ +#define __syncwarp() __all(1) +#endif + namespace paddle { namespace operators { +#ifdef __HIPCC__ +#define THREADS_PER_BLOCK 64 +#else #define THREADS_PER_BLOCK 32 +#endif #define FULL_MASK 0xffffffff using framework::Tensor; @@ -30,14 +35,22 @@ using framework::Tensor; template __forceinline__ __device__ T warpReduceSum(T val) { for (int offset = 16; offset > 0; offset /= 2) { +#ifdef __HIPCC__ + val += __shfl_down(val, offset); +#else val += __shfl_down_sync(FULL_MASK, val, offset); +#endif } return val; } template __forceinline__ __device__ T blockReduceSum(T val) { +#ifdef __HIPCC__ + static __shared__ T shared[64]; +#else static __shared__ T shared[32]; +#endif int lane = threadIdx.x % warpSize; int wid = threadIdx.x / warpSize; @@ -483,5 +496,3 @@ REGISTER_OP_CUDA_KERNEL(correlation, ops::CorrelationCUDAKernel, ops::CorrelationCUDAKernel); REGISTER_OP_CUDA_KERNEL(correlation_grad, ops::CorrelationCUDAGradKernel, ops::CorrelationCUDAGradKernel); - -#endif // not PADDLE_WITH_HIP diff --git a/paddle/fluid/operators/cudnn_lstm_cache.h b/paddle/fluid/operators/cudnn_lstm_cache.h index 3181e4b1d990b775f2c80a5d13f391886f83b080..b7859237e737a1af6bfc00c2fd6b5a7b610374e4 100644 --- a/paddle/fluid/operators/cudnn_lstm_cache.h +++ b/paddle/fluid/operators/cudnn_lstm_cache.h @@ -85,20 +85,12 @@ class ScopedRNNBase { dropout_desc_.descriptor(handle, place, initialized_, dropout_prob_, dropout_state, seed_, state_size); -// ------------------- cudnn rnn descriptors --------------------- -#if CUDNN_VERSION >= 6000 + // ------------------- cudnn rnn descriptors --------------------- PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNDescriptor_v6( handle, rnn_desc_.desc(), hidden_size_, num_layers_, dropout_desc_.desc(), CUDNN_LINEAR_INPUT, is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM, CUDNN_RNN_ALGO_STANDARD, cudnn_type)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNDescriptor( - rnn_desc_.desc(), hidden_size_, num_layers_, dropout_desc_.desc(), - CUDNN_LINEAR_INPUT, - is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM, - cudnn_type)); -#endif #if CUDNN_VERSION >= 7201 if (!sequence_length.empty()) { diff --git a/paddle/fluid/operators/cudnn_rnn_cache.h b/paddle/fluid/operators/cudnn_rnn_cache.h index 13a3e7d09b9f628f31bb9ff3b6137acf6d929c5c..a6a23a91c76c02ec656ad1d13aa41c1d3d93c8fd 100644 --- a/paddle/fluid/operators/cudnn_rnn_cache.h +++ b/paddle/fluid/operators/cudnn_rnn_cache.h @@ -168,18 +168,11 @@ struct CudnnRNNCache { PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::cudnnCreateRNNDescriptor(&rnn_desc_)); -#if CUDNN_VERSION >= 6000 PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNDescriptor_v6( handle, rnn_desc_, hidden_size_, num_layers_, dropout_desc_, CUDNN_LINEAR_INPUT, is_bidirec_ ? 
CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM, CUDNN_RNN_ALGO_STANDARD, cudnn_type)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNDescriptor( - rnn_desc_, hidden_size_, num_layers_, dropout_desc_, CUDNN_LINEAR_INPUT, - is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM, - cudnn_type)); -#endif PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::cudnnCreateFilterDescriptor(&w_desc_)); diff --git a/paddle/fluid/operators/decode_jpeg_op.cc b/paddle/fluid/operators/decode_jpeg_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..dd82c74885b9496bf64729a74a6527e68c80faf6 --- /dev/null +++ b/paddle/fluid/operators/decode_jpeg_op.cc @@ -0,0 +1,113 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace operators { + +template +class CPUDecodeJpegKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + // TODO(LieLinJiang): add cpu implement. + PADDLE_THROW(platform::errors::Unimplemented( + "DecodeJpeg op only supports GPU now.")); + } +}; + +class DecodeJpegOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "DecodeJpeg"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "DecodeJpeg"); + + auto mode = ctx->Attrs().Get("mode"); + std::vector out_dims; + + if (mode == "unchanged") { + out_dims = {-1, -1, -1}; + } else if (mode == "gray") { + out_dims = {1, -1, -1}; + } else if (mode == "rgb") { + out_dims = {3, -1, -1}; + } else { + PADDLE_THROW(platform::errors::Fatal( + "The provided mode is not supported for JPEG files on GPU: ", mode)); + } + + ctx->SetOutputDim("Out", framework::make_ddim(out_dims)); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); + } + + framework::OpKernelType GetKernelTypeForVar( + const std::string& var_name, const framework::Tensor& tensor, + const framework::OpKernelType& expected_kernel_type) const { + if (var_name == "X") { + return expected_kernel_type; + } + + return framework::OpKernelType(tensor.type(), tensor.place(), + tensor.layout()); + } +}; + +class DecodeJpegOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "A one dimensional uint8 tensor containing the raw bytes " + "of the JPEG image. 
It is a tensor with rank 1."); + AddOutput("Out", "The output tensor of DecodeJpeg op"); + AddComment(R"DOC( +This operator decodes a JPEG image into a 3 dimensional RGB Tensor +or 1 dimensional Gray Tensor. Optionally converts the image to the +desired format. The values of the output tensor are uint8 between 0 +and 255. +)DOC"); + AddAttr( + "mode", + "(string, default \"unchanged\"), The read mode used " + "for optionally converting the image, can be \"unchanged\" " + ",\"gray\" , \"rgb\" .") + .SetDefault("unchanged"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR( + decode_jpeg, ops::DecodeJpegOp, ops::DecodeJpegOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker) + +REGISTER_OP_CPU_KERNEL(decode_jpeg, ops::CPUDecodeJpegKernel) diff --git a/paddle/fluid/operators/decode_jpeg_op.cu b/paddle/fluid/operators/decode_jpeg_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..11616b0e0c4daced68e8faf16a319d0c40f66244 --- /dev/null +++ b/paddle/fluid/operators/decode_jpeg_op.cu @@ -0,0 +1,138 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#if !defined(WITH_NV_JETSON) && !defined(PADDLE_WITH_HIP) + +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/dynload/nvjpeg.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/stream/cuda_stream.h" + +namespace paddle { +namespace operators { + +static cudaStream_t nvjpeg_stream = nullptr; +static nvjpegHandle_t nvjpeg_handle = nullptr; + +void InitNvjpegImage(nvjpegImage_t* img) { + for (int c = 0; c < NVJPEG_MAX_COMPONENT; c++) { + img->channel[c] = nullptr; + img->pitch[c] = 0; + } +} + +template +class GPUDecodeJpegKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + // Create nvJPEG handle + if (nvjpeg_handle == nullptr) { + nvjpegStatus_t create_status = + platform::dynload::nvjpegCreateSimple(&nvjpeg_handle); + + PADDLE_ENFORCE_EQ(create_status, NVJPEG_STATUS_SUCCESS, + platform::errors::Fatal("nvjpegCreateSimple failed: ", + create_status)); + } + + nvjpegJpegState_t nvjpeg_state; + nvjpegStatus_t state_status = + platform::dynload::nvjpegJpegStateCreate(nvjpeg_handle, &nvjpeg_state); + + PADDLE_ENFORCE_EQ(state_status, NVJPEG_STATUS_SUCCESS, + platform::errors::Fatal("nvjpegJpegStateCreate failed: ", + state_status)); + + int components; + nvjpegChromaSubsampling_t subsampling; + int widths[NVJPEG_MAX_COMPONENT]; + int heights[NVJPEG_MAX_COMPONENT]; + + auto* x = ctx.Input("X"); + auto* x_data = x->data(); + + nvjpegStatus_t info_status = platform::dynload::nvjpegGetImageInfo( + nvjpeg_handle, x_data, (size_t)x->numel(), &components, &subsampling, + widths, heights); + + PADDLE_ENFORCE_EQ( + info_status, NVJPEG_STATUS_SUCCESS, + platform::errors::Fatal("nvjpegGetImageInfo failed: ", info_status)); + + int 
width = widths[0]; + int height = heights[0]; + + nvjpegOutputFormat_t output_format; + int output_components; + + auto mode = ctx.Attr("mode"); + if (mode == "unchanged") { + if (components == 1) { + output_format = NVJPEG_OUTPUT_Y; + output_components = 1; + } else if (components == 3) { + output_format = NVJPEG_OUTPUT_RGB; + output_components = 3; + } else { + platform::dynload::nvjpegJpegStateDestroy(nvjpeg_state); + PADDLE_THROW(platform::errors::Fatal( + "The provided mode is not supported for JPEG files on GPU")); + } + } else if (mode == "gray") { + output_format = NVJPEG_OUTPUT_Y; + output_components = 1; + } else if (mode == "rgb") { + output_format = NVJPEG_OUTPUT_RGB; + output_components = 3; + } else { + platform::dynload::nvjpegJpegStateDestroy(nvjpeg_state); + PADDLE_THROW(platform::errors::Fatal( + "The provided mode is not supported for JPEG files on GPU")); + } + + nvjpegImage_t out_image; + InitNvjpegImage(&out_image); + + // create nvjpeg stream + if (nvjpeg_stream == nullptr) { + cudaStreamCreateWithFlags(&nvjpeg_stream, cudaStreamNonBlocking); + } + + int sz = widths[0] * heights[0]; + + auto* out = ctx.Output("Out"); + std::vector out_shape = {output_components, height, width}; + out->Resize(framework::make_ddim(out_shape)); + + T* data = out->mutable_data(ctx.GetPlace()); + + for (int c = 0; c < output_components; c++) { + out_image.channel[c] = data + c * sz; + out_image.pitch[c] = width; + } + + nvjpegStatus_t decode_status = platform::dynload::nvjpegDecode( + nvjpeg_handle, nvjpeg_state, x_data, x->numel(), output_format, + &out_image, nvjpeg_stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(decode_jpeg, ops::GPUDecodeJpegKernel) + +#endif diff --git a/paddle/fluid/operators/detection/anchor_generator_op.cu b/paddle/fluid/operators/detection/anchor_generator_op.cu index b4c27a63dbd2f2fdbd9b018aa1606a79d5b0002d..388b8531571086f8ba464a51a4ebce7fb816bc31 100644 --- a/paddle/fluid/operators/detection/anchor_generator_op.cu +++ b/paddle/fluid/operators/detection/anchor_generator_op.cu @@ -49,14 +49,11 @@ __global__ void GenAnchors(T* out, const T* aspect_ratios, const int ar_num, anchor_width = scale_w * base_w; anchor_height = scale_h * base_h; - T xmin = (x_ctr - 0.5 * (anchor_width - 1)); - T ymin = (y_ctr - 0.5 * (anchor_height - 1)); - T xmax = (x_ctr + 0.5 * (anchor_width - 1)); - T ymax = (y_ctr + 0.5 * (anchor_height - 1)); - out[i * 4] = xmin; - out[i * 4 + 1] = ymin; - out[i * 4 + 2] = xmax; - out[i * 4 + 3] = ymax; + T xmin = (x_ctr - .5f * (anchor_width - 1)); + T ymin = (y_ctr - .5f * (anchor_height - 1)); + T xmax = (x_ctr + .5f * (anchor_width - 1)); + T ymax = (y_ctr + .5f * (anchor_height - 1)); + reinterpret_cast(out)[i] = make_float4(xmin, ymin, xmax, ymax); } } diff --git a/paddle/fluid/operators/detection/anchor_generator_op.h b/paddle/fluid/operators/detection/anchor_generator_op.h index e0e499d76a19ba5f6b91ba4c8797684fb53c7caa..599f6935736f946bc021cf70177a45ed2b9679e3 100644 --- a/paddle/fluid/operators/detection/anchor_generator_op.h +++ b/paddle/fluid/operators/detection/anchor_generator_op.h @@ -22,6 +22,19 @@ limitations under the License. 
*/ namespace paddle { namespace operators { +#ifdef PADDLE_WITH_CUDA +template +extern __global__ void GenAnchors(T* out, const T* aspect_ratios, + const int ar_num, const T* anchor_sizes, + const int as_num, const T* stride, + const int sd_num, const int height, + const int width, const T offset); + +template +extern __global__ void SetVariance(T* out, const T* var, const int vnum, + const int num); +#endif + template class AnchorGeneratorOpKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/detection/bbox_util.cu.h b/paddle/fluid/operators/detection/bbox_util.cu.h index 27852d43948327c498e73a51e4151a25c31f64c3..725983f8153e4f14f41552ae26800d363a863ec1 100644 --- a/paddle/fluid/operators/detection/bbox_util.cu.h +++ b/paddle/fluid/operators/detection/bbox_util.cu.h @@ -66,7 +66,8 @@ static void SortDescending(const platform::CUDADeviceContext &ctx, // Determine temporary device storage requirements size_t temp_storage_bytes = 0; cub::DeviceRadixSort::SortPairsDescending( - nullptr, temp_storage_bytes, keys_in, keys_out, idx_in, idx_out, num); + nullptr, temp_storage_bytes, keys_in, keys_out, idx_in, idx_out, num, 0, + sizeof(T) * 8, ctx.stream()); // Allocate temporary storage auto place = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); auto d_temp_storage = memory::Alloc(place, temp_storage_bytes); @@ -74,7 +75,7 @@ static void SortDescending(const platform::CUDADeviceContext &ctx, // Run sorting operation cub::DeviceRadixSort::SortPairsDescending( d_temp_storage->ptr(), temp_storage_bytes, keys_in, keys_out, idx_in, - idx_out, num); + idx_out, num, 0, sizeof(T) * 8, ctx.stream()); } template @@ -275,15 +276,19 @@ static void NMS(const platform::CUDADeviceContext &ctx, const Tensor &proposals, const T *boxes = proposals.data(); auto place = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); - framework::Vector mask(boxes_num * col_blocks); - NMSKernel<<>>(boxes_num, nms_threshold, boxes, - mask.CUDAMutableData(BOOST_GET_CONST( - platform::CUDAPlace, ctx.GetPlace())), - pixel_offset); + auto mask_ptr = memory::Alloc(ctx, boxes_num * col_blocks * sizeof(uint64_t)); + uint64_t *mask_dev = reinterpret_cast(mask_ptr->ptr()); + + NMSKernel<<>>( + boxes_num, nms_threshold, boxes, mask_dev, pixel_offset); std::vector remv(col_blocks); memset(&remv[0], 0, sizeof(uint64_t) * col_blocks); + std::vector mask_host(boxes_num * col_blocks); + memory::Copy(platform::CPUPlace(), mask_host.data(), place, mask_dev, + boxes_num * col_blocks * sizeof(uint64_t), ctx.stream()); + std::vector keep_vec; int num_to_keep = 0; for (int i = 0; i < boxes_num; i++) { @@ -293,7 +298,7 @@ static void NMS(const platform::CUDADeviceContext &ctx, const Tensor &proposals, if (!(remv[nblock] & (1ULL << inblock))) { ++num_to_keep; keep_vec.push_back(i); - uint64_t *p = &mask[0] + i * col_blocks; + uint64_t *p = mask_host.data() + i * col_blocks; for (int j = nblock; j < col_blocks; j++) { remv[j] |= p[j]; } diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu index bc74c80e0315fac6de3ca575d53b23965adf4179..ffd9ac6b2af806ed92bea6484a077cc83de7af3c 100644 --- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu +++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu @@ -144,7 +144,7 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel { size_t temp_storage_bytes = 0; cub::DeviceRadixSort::SortPairsDescending( nullptr, temp_storage_bytes, concat_scores.data(), keys_out, 
idx_in, - idx_out, total_roi_num); + idx_out, total_roi_num, 0, sizeof(T) * 8, dev_ctx.stream()); // Allocate temporary storage auto d_temp_storage = memory::Alloc(place, temp_storage_bytes); @@ -152,7 +152,8 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel { // sort score to get corresponding index cub::DeviceRadixSort::SortPairsDescending( d_temp_storage->ptr(), temp_storage_bytes, concat_scores.data(), - keys_out, idx_in, idx_out, total_roi_num); + keys_out, idx_in, idx_out, total_roi_num, 0, sizeof(T) * 8, + dev_ctx.stream()); index_out_t.Resize({real_post_num}); Tensor sorted_rois; sorted_rois.mutable_data({real_post_num, kBBoxSize}, dev_ctx.GetPlace()); @@ -176,7 +177,8 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel { temp_storage_bytes = 0; cub::DeviceRadixSort::SortPairs( nullptr, temp_storage_bytes, sorted_batch_id.data(), out_id_data, - batch_idx_in, index_out_t.data(), real_post_num); + batch_idx_in, index_out_t.data(), real_post_num, 0, + sizeof(int) * 8, dev_ctx.stream()); // Allocate temporary storage d_temp_storage = memory::Alloc(place, temp_storage_bytes); @@ -184,7 +186,8 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel { // sort batch_id to get corresponding index cub::DeviceRadixSort::SortPairs( d_temp_storage->ptr(), temp_storage_bytes, sorted_batch_id.data(), - out_id_data, batch_idx_in, index_out_t.data(), real_post_num); + out_id_data, batch_idx_in, index_out_t.data(), real_post_num, 0, + sizeof(int) * 8, dev_ctx.stream()); GPUGather(dev_ctx, sorted_rois, index_out_t, fpn_rois); @@ -198,8 +201,8 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel { int threads = kNumCUDAThreads; // get length-based lod by batch ids - GetLengthLoD<<>>(real_post_num, out_id_data, - length_lod_data); + GetLengthLoD<<>>( + real_post_num, out_id_data, length_lod_data); std::vector length_lod_cpu(lod_size); memory::Copy(platform::CPUPlace(), length_lod_cpu.data(), place, length_lod_data, sizeof(int) * lod_size, dev_ctx.stream()); diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu index cc61035309eaab31534119ab088bf537bf71c242..7ccb354e1773a35957f9df97d26d79d50cbd4fd6 100644 --- a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu +++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu @@ -131,11 +131,10 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel { int dist_blocks = NumBlocks(roi_num); int threads = kNumCUDAThreads; // get target levels and sub_lod list - GPUDistFpnProposalsHelper<<>>( + GPUDistFpnProposalsHelper<<>>( roi_num, fpn_rois->data(), lod_size, refer_level, refer_scale, max_level, min_level, roi_batch_id_list_gpu.data(), sub_lod_list_data, target_lvls_data, pixel_offset); - dev_ctx.Wait(); auto place = BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()); Tensor index_in_t; @@ -150,9 +149,9 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel { // Determine temporary device storage requirements size_t temp_storage_bytes = 0; - cub::DeviceRadixSort::SortPairs(nullptr, temp_storage_bytes, - target_lvls_data, keys_out, - idx_in, idx_out, roi_num); + cub::DeviceRadixSort::SortPairs( + nullptr, temp_storage_bytes, target_lvls_data, keys_out, idx_in, + idx_out, roi_num, 0, sizeof(int) * 8, dev_ctx.stream()); // Allocate temporary storage auto d_temp_storage = memory::Alloc(place, temp_storage_bytes); @@ -160,29 +159,30 @@ class 
GPUDistributeFpnProposalsOpKernel : public framework::OpKernel { // sort target level to get corresponding index cub::DeviceRadixSort::SortPairs( d_temp_storage->ptr(), temp_storage_bytes, target_lvls_data, keys_out, - idx_in, idx_out, roi_num); + idx_in, idx_out, roi_num, 0, sizeof(int) * 8, dev_ctx.stream()); int* restore_idx_data = restore_index->mutable_data({roi_num, 1}, dev_ctx.GetPlace()); // sort current index to get restore index cub::DeviceRadixSort::SortPairs( d_temp_storage->ptr(), temp_storage_bytes, idx_out, keys_out, idx_in, - restore_idx_data, roi_num); + restore_idx_data, roi_num, 0, sizeof(int) * 8, dev_ctx.stream()); int start = 0; auto multi_rois_num = ctx.MultiOutput("MultiLevelRoIsNum"); + std::vector sub_lod_list_cpu(lod_size * num_level); + memory::Copy(platform::CPUPlace(), sub_lod_list_cpu.data(), place, + sub_lod_list_data, sizeof(int) * lod_size * num_level, + dev_ctx.stream()); + dev_ctx.Wait(); + for (int i = 0; i < num_level; ++i) { Tensor sub_lod = sub_lod_list.Slice(i, i + 1); - int* sub_lod_data = sub_lod.data(); // transfer length-based lod to offset-based lod std::vector offset(1, 0); - std::vector sub_lod_cpu(lod_size); - memory::Copy(platform::CPUPlace(), sub_lod_cpu.data(), place, - sub_lod_data, sizeof(int) * lod_size, dev_ctx.stream()); - dev_ctx.Wait(); for (int j = 0; j < lod_size; ++j) { - offset.emplace_back(offset.back() + sub_lod_cpu[j]); + offset.emplace_back(offset.back() + sub_lod_list_cpu[i * lod_size + j]); } int sub_rois_num = offset.back(); diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cu b/paddle/fluid/operators/detection/generate_proposals_op.cu index 8359fbab519b36f58fbeaf02082f02a1372993fc..e8ab628db16bdd591adb670bafc5e05aeac8efed 100644 --- a/paddle/fluid/operators/detection/generate_proposals_op.cu +++ b/paddle/fluid/operators/detection/generate_proposals_op.cu @@ -198,7 +198,6 @@ class CUDAGenerateProposalsKernel : public framework::OpKernel { memory::Copy(place, rpn_roi_probs_data + num_proposals, place, scores.data(), sizeof(T) * scores.numel(), dev_ctx.stream()); - dev_ctx.Wait(); num_proposals += proposals.dims()[0]; offset.emplace_back(num_proposals); tmp_num.push_back(proposals.dims()[0]); diff --git a/paddle/fluid/operators/detection/polygon_box_transform_op.cu b/paddle/fluid/operators/detection/polygon_box_transform_op.cu index 337a76f9f976f852973c4dd413427890a2ef37b3..5977a434a6023f4463c9fb7ab1067f038747e44c 100644 --- a/paddle/fluid/operators/detection/polygon_box_transform_op.cu +++ b/paddle/fluid/operators/detection/polygon_box_transform_op.cu @@ -45,7 +45,8 @@ class PolygonBoxTransformOpCUDAKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE_EQ( platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::InvalidArgument("It must use CUDAPlace.")); + platform::errors::InvalidArgument( + "The polygon_box_transform operator needs to be executed on GPU.")); auto* in = ctx.Input("Input"); auto in_dims = in->dims(); const T* in_data = in->data(); diff --git a/paddle/fluid/operators/diag_embed_op.cu b/paddle/fluid/operators/diag_embed_op.cu index 2e03622e10f0f477cd0b66c0bcafe9f7bf29a0f8..7e3ab6be664cb92370d50688ab93f9462ec89463 100644 --- a/paddle/fluid/operators/diag_embed_op.cu +++ b/paddle/fluid/operators/diag_embed_op.cu @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
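(The recurring change in the detection-op hunks above is to pass an explicit bit range and the device context's stream to cub::DeviceRadixSort, so the sort is queued on Paddle's own stream rather than the default stream. Below is a minimal sketch of that two-phase CUB call; the buffer names, KeyT, and the allocator are illustrative assumptions, not code from the patch.)

// Phase 1: pass nullptr as temp storage to query the required size; no work runs.
size_t temp_bytes = 0;
cub::DeviceRadixSort::SortPairsDescending(
    nullptr, temp_bytes, keys_in, keys_out, vals_in, vals_out, num_items,
    /*begin_bit=*/0, /*end_bit=*/sizeof(KeyT) * 8, stream);
// Phase 2: allocate that much device memory and run the sort asynchronously on `stream`
// (in the patch this is memory::Alloc(place, ...) and dev_ctx.stream()).
void* d_temp = AllocateOnDevice(temp_bytes);  // illustrative allocator
cub::DeviceRadixSort::SortPairsDescending(
    d_temp, temp_bytes, keys_in, keys_out, vals_in, vals_out, num_items,
    /*begin_bit=*/0, /*end_bit=*/sizeof(KeyT) * 8, stream);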
+#include +#include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/diag_embed_op.h" diff --git a/paddle/fluid/operators/dist_op.h b/paddle/fluid/operators/dist_op.h index a2279e40623b4ba2f0421e73a6148b89eb970e71..6a34ef48a169dc5e31f845f9993eef721faf2e7c 100644 --- a/paddle/fluid/operators/dist_op.h +++ b/paddle/fluid/operators/dist_op.h @@ -167,6 +167,7 @@ static void DistGradFunction(const framework::ExecutionContext& context) { auto sign = (x_minux_y > static_cast(0)).template cast() * static_cast(1.0) + (x_minux_y < static_cast(0)).template cast() * static_cast(-1.0); + T epsilon = static_cast(1.0e-10f); // 1: Lp-norm(z), z = x-y, compute dz if (p == 0) { @@ -189,12 +190,14 @@ static void DistGradFunction(const framework::ExecutionContext& context) { // dz = pow(abs(x-y)/out, p-1) * sign(x-y) * dout if (platform::is_cpu_place(context.GetPlace())) { grad_t.device(place) = - (x_minux_y_abs / out_t.broadcast(out_bcast_dims)).pow(p - 1) * + (x_minux_y_abs / (out_t + epsilon).broadcast(out_bcast_dims)) + .pow(p - 1) * sign.eval() * out_grad_t.broadcast(out_bcast_dims); } else { grad_t.device(place) = - (x_minux_y_abs / out_t.broadcast(out_bcast_dims)).pow(p - 1) * sign * - out_grad_t.broadcast(out_bcast_dims); + (x_minux_y_abs / (out_t + epsilon).broadcast(out_bcast_dims)) + .pow(p - 1) * + sign * out_grad_t.broadcast(out_bcast_dims); } } diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt deleted file mode 100644 index c9db6148bc45d44e8336fc662ae1b4337445f738..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/CMakeLists.txt +++ /dev/null @@ -1,76 +0,0 @@ -return() - -if(WITH_GRPC) - set(cc_generic_services "false") -else() - set(cc_generic_services "true") -endif() -configure_file(send_recv.proto.in ${CMAKE_CURRENT_SOURCE_DIR}/send_recv.proto @ONLY) - -cc_library(async_sparse_param_update_recorder SRCS async_sparse_param_update_recorder.cc DEPS enforce simple_threadpool) -cc_test(async_sparse_param_update_recorder_test SRCS async_sparse_param_update_recorder_test.cc DEPS async_sparse_param_update_recorder) - -cc_library(heart_beat_monitor SRCS heart_beat_monitor.cc DEPS enforce simple_threadpool) -cc_library(large_scale_kv SRCS large_scale_kv.cc DEPS enforce simple_threadpool device_context) -cc_test(heart_beat_monitor_test SRCS heart_beat_monitor_test.cc DEPS heart_beat_monitor) - -# FIXME(typhoonzero): use add_subdirectory once we clean the dependency of these files -set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") -if(WITH_GRPC) - set(GRPC_DEPS grpc++_unsecure grpc_unsecure gpr zlib protobuf) - set(GRPC_SRCS grpc/grpc_client.cc grpc/grpc_server.cc grpc/grpc_serde.cc grpc/grpc_bytebuffer_stream.cc grpc/grpc_variable_response.cc) - grpc_library(sendrecvop_rpc SRCS sendrecvop_utils.cc - request_handler_impl.cc rpc_client.cc rpc_server.cc - variable_response.cc - collective_client.cc collective_server.cc - ${GRPC_SRCS} - PROTO send_recv.proto - DEPS lod_tensor selected_rows_functor memory scope ${GRPC_DEPS} async_sparse_param_update_recorder heart_beat_monitor large_scale_kv) - - set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - set(RPC_DEPS sendrecvop_rpc ${GRPC_DEPS}) - - cc_test(grpc_serde_test SRCS grpc/grpc_serde_test.cc - DEPS ${RPC_DEPS} scope profiler math_function) - -else() - set(BRPC_SRCS brpc/brpc_client.cc 
brpc/brpc_server.cc brpc/brpc_sendrecvop_utils.cc brpc/brpc_variable_response.cc brpc/brpc_rdma_pool.cc) - set_source_files_properties(${BRPC_SRCS} parameter_prefetch.cc parameter_send.cc parameter_recv.cc communicator.cc rpc_server_test.cc brpc/brpc_serde_test.cc collective_server.cc collective_server_test.cc collective_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - - set(BRPC_DEPS brpc ssl crypto protobuf leveldb zlib) - - brpc_library(sendrecvop_rpc SRCS sendrecvop_utils.cc - request_handler_impl.cc rpc_client.cc rpc_server.cc - variable_response.cc - collective_client.cc collective_server.cc - ${BRPC_SRCS} - PROTO send_recv.proto - DEPS lod_tensor selected_rows memory scope ${BRPC_DEPS}) - - set(RPC_DEPS sendrecvop_rpc ${BRPC_DEPS}) - cc_test(brpc_serde_test SRCS brpc/brpc_serde_test.cc - DEPS ${RPC_DEPS} gflags glog executor proto_desc lookup_sparse_table_read_op) -endif() - - -cc_test(rpc_server_test SRCS rpc_server_test.cc - DEPS ${RPC_DEPS} executor scope proto_desc lookup_sparse_table_read_op checkpoint_notify_op scale_op ) -cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler scope) -cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_rpc memory node) -cc_library(parameter_send SRCS parameter_send.cc DEPS sendrecvop_rpc memory) -cc_library(parameter_recv SRCS parameter_recv.cc DEPS sendrecvop_rpc memory) -cc_library(communicator SRCS communicator.cc DEPS scope selected_rows tensor variable_helper selected_rows_functor simple_threadpool parameter_send parameter_recv generator) -cc_test(communicator_test SRCS communicator_test.cc DEPS communicator) -if(WITH_GPU OR WITH_ROCM) - cc_test(collective_server_test SRCS collective_server_test.cc - DEPS sendrecvop_rpc executor ${RPC_DEPS} - selected_rows_functor scope math_function) -endif() -if(WITH_TESTING) - if(TEST rpc_server_test) - set_tests_properties(rpc_server_test PROPERTIES TIMEOUT 120) - endif() - if(TEST heart_beat_monitor_test) - set_tests_properties(heart_beat_monitor_test PROPERTIES TIMEOUT 120) - endif() -endif() diff --git a/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h b/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h deleted file mode 100644 index 28a5f2ad6c7648668bc75c130ab317ba4ab93cc7..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h +++ /dev/null @@ -1,186 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include -#include -#include // NOLINT -#include -#include -#include -#include -#include -#include - -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace operators { -namespace distributed { - -class ConcurrentSet { - public: - ConcurrentSet() : pool_(new ::ThreadPool(1)) {} - ~ConcurrentSet() {} - - std::future Update(const std::vector& rows) { - auto task = [this, rows] { - if (VLOG_IS_ON(3)) { - std::ostringstream sstream; - sstream << "["; - for (auto& id : rows) { - sstream << id << ", "; - } - sstream << "]"; - VLOG(3) << "update ids -> " << sstream.str(); - } - for (auto row : rows) { - set_.insert(row); - } - }; - return pool_->enqueue(std::move(task)); - } - - std::future GetAndClear(std::vector* result) { - auto task = [this, &result] { - result->clear(); - for (auto& id : set_) { - result->push_back(id); - } - if (VLOG_IS_ON(3)) { - std::ostringstream sstream; - sstream << "["; - for (auto& id : *result) { - sstream << id << ", "; - } - sstream << "]"; - VLOG(3) << "result ids size: " << result->size() << " " - << sstream.str(); - } - set_.clear(); - }; - return pool_->enqueue(std::move(task)); - } - - private: - std::unordered_set set_; - std::unique_ptr<::ThreadPool> pool_{nullptr}; -}; - -class AsyncSparseParamUpdateRecorder { - using TrainerToRows = std::vector>; - - public: - AsyncSparseParamUpdateRecorder( - int trainer_num, - const std::unordered_map& grad_to_param) - : trainer_num_(trainer_num), grad_to_param_(grad_to_param) { - if (VLOG_IS_ON(3)) { - std::ostringstream sstream; - sstream << "["; - for (auto& item : grad_to_param) { - sstream << item.first << ":" << item.second << ", "; - } - sstream << "]"; - VLOG(3) << "trainer_num: " << trainer_num - << " grad_to_param_: " << sstream.str(); - } - for (auto& iter : grad_to_param) { - param_to_grad_[iter.second] = iter.first; - auto& param_name = iter.second; - param_to_updated_rows_[param_name] = TrainerToRows(); - auto& trainer_to_rows = param_to_updated_rows_[param_name]; - for (auto i = 0; i < trainer_num; ++i) { - trainer_to_rows.emplace_back(new ConcurrentSet()); - } - } - } - - ~AsyncSparseParamUpdateRecorder() = default; - - void Update(const std::string& grad_name, - const std::vector& update_rows) { - VLOG(3) << "update grad: " << grad_name - << " row size: " << update_rows.size(); - auto& param_name = grad_to_param_.at(grad_name); - auto& trainer_to_rows = param_to_updated_rows_.at(param_name); - - std::vector> fs; - for (auto& set : trainer_to_rows) { - fs.push_back(set->Update(update_rows)); - } - for (auto& f : fs) { - f.wait(); - } - } - - void GetAndClear(const std::string& param_name, int trainer_id, - std::vector* result) { - VLOG(3) << "GetAndClear param: " << param_name - << " for trainer: " << trainer_id; - PADDLE_ENFORCE_LT( - trainer_id, trainer_num_, - platform::errors::InvalidArgument( - "The value of trainer_id: %s should less than trainer_num: %s.", - trainer_id, trainer_num_)); - param_to_updated_rows_.at(param_name)[trainer_id] - ->GetAndClear(result) - .wait(); - } - - bool HasParam(const std::string& param_name) { - return param_to_grad_.find(param_name) != param_to_grad_.end(); - } - - bool HasGrad(const std::string& grad_name) { - return grad_to_param_.find(grad_name) != grad_to_param_.end(); - } - - private: - const int trainer_num_; - std::unordered_map grad_to_param_; - std::unordered_map param_to_grad_; - std::unordered_map param_to_updated_rows_; - - // init recorder - public: - static void Init( - int trainer_num, - const 
std::unordered_map& grad_to_param) { - InitImpl(trainer_num, grad_to_param); - } - - static AsyncSparseParamUpdateRecorder* GetInstance() { - return recorder_.get(); - } - - private: - // Init is called by GetInstance. - static void InitImpl( - int trainer_num, - const std::unordered_map& grad_to_param) { - if (recorder_ == nullptr) { - recorder_.reset( - new AsyncSparseParamUpdateRecorder(trainer_num, grad_to_param)); - } - } - - static std::once_flag init_flag_; - static std::unique_ptr recorder_; -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/async_sparse_param_update_recorder_test.cc b/paddle/fluid/operators/distributed/async_sparse_param_update_recorder_test.cc deleted file mode 100644 index 2d78559625c91fadec1fbb282b08e542a07d964a..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/async_sparse_param_update_recorder_test.cc +++ /dev/null @@ -1,97 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h" -#include -#include "gtest/gtest.h" - -namespace paddle { -namespace operators { -namespace distributed { - -TEST(ConcurrentSet, All) { - ConcurrentSet concurrent_set; - std::vector in1 = {1, 2, 3, 4}; - std::vector in2 = {2, 3, 5, 6}; - - std::vector> futures; - futures.push_back(concurrent_set.Update(in1)); - futures.push_back(concurrent_set.Update(in2)); - - for (auto &f : futures) { - f.wait(); - } - - std::unordered_set in; - std::copy(in1.begin(), in1.end(), std::inserter(in, in.begin())); - std::copy(in2.begin(), in2.end(), std::inserter(in, in.begin())); - - std::vector ret; - concurrent_set.GetAndClear(&ret).wait(); - - std::unordered_set out; - std::copy(ret.begin(), ret.end(), std::inserter(out, out.begin())); - - EXPECT_EQ(in, out); - - concurrent_set.GetAndClear(&ret).wait(); - EXPECT_EQ(ret.size(), 0UL); -} - -TEST(AsyncSparseParamUpdateRecorder, All) { - std::unordered_map grad_to_param; - grad_to_param["grad1"] = "param1"; - grad_to_param["grad2"] = "param2"; - - int trainer_num = 10; - - AsyncSparseParamUpdateRecorder recorder(trainer_num, grad_to_param); - std::vector in1 = {1, 2, 3, 4}; - std::vector in2 = {2, 3, 5, 6}; - - std::unordered_set in; - std::copy(in1.begin(), in1.end(), std::inserter(in, in.begin())); - std::copy(in2.begin(), in2.end(), std::inserter(in, in.begin())); - - recorder.Update("grad1", in1); - recorder.Update("grad1", in2); - - EXPECT_TRUE(recorder.HasParam("param1")); - EXPECT_TRUE(recorder.HasParam("param2")); - EXPECT_FALSE(recorder.HasParam("param3")); - - EXPECT_TRUE(recorder.HasGrad("grad1")); - EXPECT_TRUE(recorder.HasGrad("grad2")); - EXPECT_FALSE(recorder.HasGrad("grad3")); - - std::vector ret; - EXPECT_ANY_THROW(recorder.GetAndClear("param1", trainer_num, &ret)); - - for (int i = 0; i < trainer_num; ++i) { - std::vector ret; - std::unordered_set out; - - 
recorder.GetAndClear("param1", i, &ret); - std::copy(ret.begin(), ret.end(), std::inserter(out, out.begin())); - - EXPECT_EQ(in, out); - - recorder.GetAndClear("param1", i, &ret); - EXPECT_EQ(ret.size(), 0UL); - } -} - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc/brpc_client.cc b/paddle/fluid/operators/distributed/brpc/brpc_client.cc deleted file mode 100644 index b2a26089c868968734df35383f2ddf2ba512b137..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/brpc/brpc_client.cc +++ /dev/null @@ -1,462 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/distributed/brpc/brpc_client.h" -#include "paddle/fluid/framework/threadpool.h" -#include "paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h" -#include "paddle/fluid/platform/profiler.h" - -namespace paddle { -namespace operators { -namespace distributed { - -DEFINE_int32(timeout_ms, 30000, "RPC timeout in milliseconds"); -DEFINE_int32(max_retry, 3, "Max retries(not including the first RPC)"); - -BRPCClient::~BRPCClient() { Wait(); } - -void HandleSendResponse(brpc::Controller* cntl, sendrecv::VoidMessage* response, - VarHandlePtr var_h, ChannelQueuePtr ch_ptr, - ChannelContextPtr ch_ctx, BRPCClient* cls) { - // std::unique_ptr makes sure cntl/response will be deleted before returning. - std::unique_ptr cntl_guard(cntl); - std::unique_ptr response_guard(response); - - // this channel can be used by other now. 
- ch_ptr->Push(ch_ctx); - - if (cntl->Failed()) { - PADDLE_THROW(platform::errors::Unavailable( - "Failed to send variable %s, error text is %s.", var_h->name(), - cntl->ErrorText())); - var_h->Finish(false); - cls->DecreaseReqCount(); - return; - } - var_h->Finish(true); - cls->DecreaseReqCount(); - - VLOG(4) << "HandleSendResponse from: " << cntl->remote_side() - << ", varname: " << var_h->name() - << ", latency: " << cntl->latency_us() << "us"; - VLOG(4) << "Finish HandleSendResponse"; -} - -VarHandlePtr BRPCClient::AsyncSendVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - int64_t time_out) { - const platform::DeviceContext* p_ctx = &ctx; - const std::string ep_val = ep; - const std::string var_name_val = var_name; - const framework::Scope* p_scope = &scope; - const auto ch_ptr = GetChannel(ep_val); - const std::string method = kSendRPC; - VarHandlePtr var_h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope)); - - framework::AsyncIO([=] { - auto ch_ctx = ch_ptr->Pop(); - brpc::Controller* cntl = new brpc::Controller(); - sendrecv::VoidMessage* response = new sendrecv::VoidMessage(); - cntl->set_timeout_ms(time_out); - - auto* var = p_scope->FindVar(var_name_val); - sendrecv::VariableMessage request; - distributed::SerializeToIOBuf(var_name_val, var, *p_ctx, &request, - &cntl->request_attachment(), "", false, - trainer_id_); - - google::protobuf::Closure* done = brpc::NewCallback( - &HandleSendResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); - - platform::RecordRPCEvent record_event(method); - - ch_ctx->stub->SendVariable(cntl, &request, response, done); - - if (UNLIKELY(platform::IsProfileEnabled())) { - var_h->Wait(); - } - }); - req_count_++; - - return var_h; -} -void HandleFetchBarrierResponse(brpc::Controller* cntl, - sendrecv::VariableMessage* response, - VarHandlePtr var_h, ChannelQueuePtr ch_ptr, - ChannelContextPtr ch_ctx, BRPCClient* cls) { - // std::unique_ptr makes sure cntl/response will be deleted before returning. - std::unique_ptr cntl_guard(cntl); - std::unique_ptr response_guard(response); - - // this channel can be used other now. - ch_ptr->Push(ch_ctx); - - if (cntl->Failed()) { - PADDLE_THROW(platform::errors::Unavailable( - "Failed to get HandleFetchBarrierResponse %s, error text is %s.", - var_h->name(), cntl->ErrorText())); - var_h->Finish(false); - cls->DecreaseReqCount(); - return; - } - - var_h->Finish(true); - cls->DecreaseReqCount(); - - VLOG(4) << "HandleFetchBarrierResponse from: " << cntl->remote_side() - << ", varname: " << var_h->name() - << ", latency: " << cntl->latency_us() << "us"; - VLOG(4) << "Finish HandleFetchBarrierResponse"; -} -void HandleGetResponse(brpc::Controller* cntl, - sendrecv::VariableMessage* response, VarHandlePtr var_h, - ChannelQueuePtr ch_ptr, ChannelContextPtr ch_ctx, - BRPCClient* cls) { - // std::unique_ptr makes sure cntl/response will be deleted before returning. - std::unique_ptr cntl_guard(cntl); - std::unique_ptr response_guard(response); - - // this channel can be used other now. 
- ch_ptr->Push(ch_ctx); - - if (cntl->Failed()) { - PADDLE_THROW(platform::errors::Unavailable( - "Failed to get variable %s, error text is %s.", var_h->name(), - cntl->ErrorText())); - cls->DecreaseReqCount(); - var_h->Finish(false); - return; - } - - VLOG(4) << "HandleGetResponse from: " << cntl->remote_side() - << ", varname: " << var_h->name() - << ", latency: " << cntl->latency_us() << "us"; - - framework::Variable* outvar = nullptr; - int trainer_id; - distributed::DeserializeFromIOBuf(*response, cntl->response_attachment(), - *var_h->ctx(), var_h->scope(), &outvar, - &trainer_id); - VLOG(4) << "Finish HandleGetResponse"; - cls->DecreaseReqCount(); - var_h->Finish(true); -} - -VarHandlePtr BRPCClient::_AsyncGetVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - const std::string& out_var_name, - const std::string& method_name, - int64_t time_out) { - const platform::DeviceContext* p_ctx = &ctx; - const std::string ep_val = ep; - const std::string var_name_val = var_name; - const std::string out_varname_val = out_var_name; - const framework::Scope* p_scope = &scope; - const auto ch_ptr = GetChannel(ep_val); - const std::string method = kGetRPC; - VarHandlePtr var_h( - new VarHandle(ep, method, out_varname_val, p_ctx, p_scope)); - - framework::AsyncIO([=] { - auto ch_ctx = ch_ptr->Pop(); - - brpc::Controller* cntl = new brpc::Controller(); - sendrecv::VariableMessage* response = new sendrecv::VariableMessage(); - cntl->set_timeout_ms(time_out); - - sendrecv::VariableMessage req; - req.set_varname(var_name_val); - req.set_out_varname(out_varname_val); - req.set_trainer_id(trainer_id_); - - google::protobuf::Closure* done = brpc::NewCallback( - &HandleGetResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); - - platform::RecordRPCEvent record_event(method); - - if (method_name == kGetMonomerRPC) { - ch_ctx->stub->GetMonomerVariable(cntl, &req, response, done); - } else if (method_name == kGetNoBarrierRPC) { - ch_ctx->stub->GetVariableNoBarrier(cntl, &req, response, done); - } else { - ch_ctx->stub->GetVariable(cntl, &req, response, done); - } - - if (UNLIKELY(platform::IsProfileEnabled())) { - var_h->Wait(); - } - }); - - req_count_++; - - return var_h; -} - -VarHandlePtr BRPCClient::AsyncGetVarNoBarrier( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - const std::string& out_var_name, int64_t time_out) { - std::string var_name_no_barrier = - string::Sprintf("%s%s", var_name, WITHOUT_BARRIER_MESSAGE); - - return _AsyncGetVar(ep, ctx, scope, var_name_no_barrier, out_var_name, - kGetNoBarrierRPC, time_out); -} - -VarHandlePtr BRPCClient::AsyncGetMonomerVariable( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - int64_t time_out) { - return _AsyncGetVar(ep, ctx, scope, var_name, var_name, kGetMonomerRPC, - time_out); -} - -VarHandlePtr BRPCClient::AsyncGetMonomerBarrier(const std::string& ep, - const std::string& var_name, - int64_t time_out) { - return AsyncSendMessage(ep, kSendMonomerFetchBarrierRPC, var_name, time_out); -} - -VarHandlePtr BRPCClient::AsyncGetVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - const std::string& out_var_name, - const std::string& table_name, - int64_t time_out) { - return _AsyncGetVar(ep, ctx, scope, var_name, out_var_name, kGetRPC, - time_out); -} - 
-VarHandlePtr BRPCClient::AsyncPrefetchVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& in_var_name, - const std::string& out_var_name, - const std::string& table_name, - int64_t time_out) { - const platform::DeviceContext* p_ctx = &ctx; - const std::string ep_val = ep; - const std::string in_var_name_val = in_var_name; - const std::string out_var_name_val = out_var_name; - const std::string table_name_val = table_name; - const framework::Scope* p_scope = &scope; - const auto ch_ptr = GetChannel(ep_val); - - const std::string method = kPrefetchRPC; - - VarHandlePtr var_h( - new VarHandle(ep, method, out_var_name_val, p_ctx, p_scope)); - - framework::AsyncIO([=] { - auto ch_ctx = ch_ptr->Pop(); - - brpc::Controller* cntl = new brpc::Controller(); - sendrecv::VariableMessage* response = new sendrecv::VariableMessage(); - cntl->set_timeout_ms(time_out); - - auto* var = p_scope->FindVar(in_var_name_val); - sendrecv::VariableMessage req; - distributed::SerializeToIOBuf(in_var_name_val, var, *p_ctx, &req, - &cntl->request_attachment(), out_var_name_val, - false, 0, table_name_val); - - platform::RecordRPCEvent record_event(method); - - google::protobuf::Closure* done = brpc::NewCallback( - &HandleGetResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); - - ch_ctx->stub->PrefetchVariable(cntl, &req, response, done); - - if (UNLIKELY(platform::IsProfileEnabled())) { - var_h->Wait(); - } - }); - - req_count_++; - return var_h; -} - -VarHandlePtr BRPCClient::AsyncSendBatchBarrier(const std::string& ep, - int64_t time_out) { - return AsyncSendMessage(ep, kBatchBarrierRPC, BATCH_BARRIER_MESSAGE, - time_out); -} - -VarHandlePtr BRPCClient::AsyncSendFetchBarrier(const std::string& ep, - int64_t time_out) { - auto ch_ptr = GetChannel(ep); - auto ch_ctx = ch_ptr->Pop(); - - brpc::Controller* cntl = new brpc::Controller(); - sendrecv::VariableMessage* response = new sendrecv::VariableMessage(); - cntl->set_timeout_ms(time_out); - - sendrecv::VariableMessage req; - req.set_varname(FETCH_BARRIER_MESSAGE); - - const std::string method = kFetchBarrierRPC; - // var handle - VarHandlePtr var_h( - new VarHandle(ep, method, FETCH_BARRIER_MESSAGE, nullptr, nullptr)); - - platform::RecordRPCEvent record_event(method); - - google::protobuf::Closure* done = brpc::NewCallback( - &HandleFetchBarrierResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); - - ch_ctx->stub->GetVariable(cntl, &req, response, done); - - req_count_++; - - if (UNLIKELY(platform::IsProfileEnabled())) { - var_h->Wait(); - } - - return var_h; -} - -bool BRPCClient::Wait() { - VLOG(9) << "begin to brpcclient wait"; - { - std::unique_lock lk(sync_mutex_); - sync_cond_.wait(lk, [this] { return req_count_ == 0; }); - } - VLOG(9) << "end to brpcclient wait"; - return true; -} - -ChannelQueuePtr BRPCClient::GetChannel(const std::string& ep) { - VLOG(4) << "begin to GetChannel:" << ep; - { - std::lock_guard guard(chan_mutex_); - auto it = channels_.find(ep); - if (it != channels_.end()) { - VLOG(4) << "end to GetChannel:" << ep; - return it->second; - } - } - - ChannelQueuePtr q(new framework::BlockingQueue()); - - brpc::ChannelOptions options; -#ifdef PADDLE_WITH_BRPC_RDMA - options.use_rdma = true; -#endif - options.protocol = "baidu_std"; - // don't use pooled type. the server can't afford that. 
- options.connection_type = "single"; - options.connect_timeout_ms = 1000; - options.timeout_ms = FLAGS_timeout_ms /*milliseconds*/; - options.max_retry = FLAGS_max_retry; - - VLOG(1) << "create " << brpc_channel_num_per_server_ - << " brpc channels to pserver:" << ep; - - for (int i = 0; i < brpc_channel_num_per_server_; ++i) { - std::shared_ptr c(new ChannelContext()); - if (c->channel.Init(ep.c_str(), &options) != 0) { - PADDLE_THROW( - platform::errors::Unavailable("Failed to initialize channel.")); - return nullptr; - } - - c->stub.reset(new sendrecv::SendRecvService_Stub( - static_cast(&c->channel))); - q->Push(c); - } - - { - std::lock_guard guard(chan_mutex_); - channels_[ep] = q; - } - - VLOG(4) << "end to GetChannel:" << ep; - return q; -} - -VarHandlePtr BRPCClient::AsyncSendComplete(const std::string& ep, - int64_t time_out) { - return AsyncSendMessage(ep, kSendCompleteRPC, COMPLETE_MESSAGE, time_out); -} - -void BRPCClient::SendComplete() { - for (auto& kv : channels_) { - AsyncSendComplete(kv.first); - } -} - -VarHandlePtr BRPCClient::AsyncSendVarMessage( - const std::string& ep, const std::string& method_name, - const sendrecv::VariableMessage& req, int64_t time_out) { - auto ch_ptr = GetChannel(ep); - auto ch_ctx = ch_ptr->Pop(); - - brpc::Controller* cntl = new brpc::Controller(); - sendrecv::VoidMessage* response = new sendrecv::VoidMessage(); - cntl->set_timeout_ms(time_out); - - platform::RecordRPCEvent record_event(method_name); - - VarHandlePtr var_h( - new VarHandle(ep, method_name, req.varname(), nullptr, nullptr)); - - google::protobuf::Closure* done = brpc::NewCallback( - &HandleSendResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); - - if (method_name == kCheckPointNotifyRPC) { - ch_ctx->stub->CheckpointNotify(cntl, &req, response, done); - } else if (method_name == kSendMonomerFetchBarrierRPC) { - ch_ctx->stub->GetMonomerBarrier(cntl, &req, response, done); - } else { - ch_ctx->stub->SendVariable(cntl, &req, response, done); - } - req_count_++; - - if (UNLIKELY(platform::IsProfileEnabled())) { - var_h->Wait(); - } - - return var_h; -} - -VarHandlePtr BRPCClient::AsyncSendMessage(const std::string& ep, - const std::string& method_name, - const std::string& message, - int64_t time_out) { - sendrecv::VariableMessage req; - req.set_varname(message); - - return AsyncSendVarMessage(ep, method_name, req, time_out); -} - -VarHandlePtr BRPCClient::AsyncCheckpointNotify(const std::string& ep, - const std::string& dirname, - const std::string& varname, - const int mode, - int64_t time_out) { - sendrecv::VariableMessage req; - req.set_varname(varname); - req.set_out_varname(dirname); - - return AsyncSendVarMessage(ep, "CheckPointNotifyRPC", req, time_out); -} - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc/brpc_client.h b/paddle/fluid/operators/distributed/brpc/brpc_client.h deleted file mode 100644 index 91f94b4c9d5a3076e17b0b54cf153e2f3b9edc31..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/brpc/brpc_client.h +++ /dev/null @@ -1,174 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include - -#include // NOLINT -#include -#include -#include -#include -#include -#include // NOLINT -#include -#include -#include - -#include "brpc/channel.h" -#include "paddle/fluid/framework/blocking_queue.h" -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h" -#include "paddle/fluid/operators/distributed/distributed_pb.h" -#include "paddle/fluid/operators/distributed/request_handler.h" -#include "paddle/fluid/operators/distributed/rpc_client.h" -#include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN - -namespace paddle { -namespace operators { -namespace distributed { - -struct ChannelContext { - brpc::Channel channel; - std::shared_ptr stub; -}; - -typedef std::shared_ptr ChannelContextPtr; -typedef std::shared_ptr> - ChannelQueuePtr; - -class BRPCClient : public RPCClient { - public: - BRPCClient() {} - virtual ~BRPCClient(); - - VarHandlePtr AsyncSendVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncGetVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - const std::string& out_var_name, - const std::string& table_name = "", - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncGetMonomerBarrier( - const std::string& ep, const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncGetMonomerVariable( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncGetVarNoBarrier(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - const std::string& out_varname, - int64_t time_out = FLAGS_rpc_deadline); - - VarHandlePtr AsyncPrefetchVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& in_var_name, - const std::string& out_var_name, - const std::string& table_name = "", - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncSendBatchBarrier( - const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncSendFetchBarrier( - const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncCheckpointNotify( - const std::string& ep, const std::string& dirname, - const std::string& varname, const int mode, - int64_t time_out = FLAGS_rpc_deadline) override; - - bool Wait() override; - - void SendComplete() override; - - private: - VarHandlePtr _AsyncGetVar( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - const std::string& out_var_name, const 
std::string& method_name, - const std::string& table_name, int64_t time_out = FLAGS_rpc_deadline); - - void Proceed(); - ChannelQueuePtr GetChannel(const std::string& ep); - - VarHandlePtr AsyncSendComplete(const std::string& ep, - int64_t time_out = FLAGS_rpc_deadline); - - VarHandlePtr AsyncSendMessage(const std::string& ep, - const std::string& method_name, - const std::string& message, int64_t time_out); - - VarHandlePtr AsyncSendVarMessage(const std::string& ep, - const std::string& method_name, - const sendrecv::VariableMessage& req, - int64_t time_out); - - friend void HandleSendResponse(brpc::Controller* cntl, - sendrecv::VoidMessage* response, - VarHandlePtr var_h, ChannelQueuePtr ch_ptr, - ChannelContextPtr ch_ctx, BRPCClient* cls); - - friend void HandleGetResponse(brpc::Controller* cntl, - sendrecv::VariableMessage* response, - VarHandlePtr var_h, ChannelQueuePtr ch_ptr, - ChannelContextPtr ch_ctx, BRPCClient* cls); - - friend void HandleFetchBarrierResponse(brpc::Controller* cntl, - sendrecv::VariableMessage* response, - VarHandlePtr var_h, - ChannelQueuePtr ch_ptr, - ChannelContextPtr ch_ctx, - BRPCClient* cls); - void DecreaseReqCount() { - if (--req_count_ <= 0) { - sync_cond_.notify_all(); - } - } - - private: - std::unordered_map channels_; - - // mutex for Wait client sync - std::mutex sync_mutex_; - std::condition_variable sync_cond_; - std::atomic req_count_{0}; - - static constexpr int brpc_channel_num_per_server_ = 4; - - // mutex for GetChannel thread safety - std::mutex chan_mutex_; - DISABLE_COPY_AND_ASSIGN(BRPCClient); -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.cc b/paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.cc deleted file mode 100644 index 94f0b9919ace835e32d2e1a25e7738a9e88ed7bd..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.cc +++ /dev/null @@ -1,86 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
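GetChannel and HandleGetResponse above implement a borrow-and-return pool: each endpoint owns a small, fixed number of ChannelContext objects (brpc_channel_num_per_server_ of them) kept in a blocking queue, an RPC Pops one for the duration of the call, and the response callback Pushes it back. The following is a standalone sketch of that pattern only; BlockingPool is an illustrative stand-in, not Paddle's framework::BlockingQueue.

#include <condition_variable>
#include <deque>
#include <mutex>
#include <utility>

// Minimal blocking pool: Pop() blocks until an item is available, Push()
// returns an item and wakes one waiter.
template <typename T>
class BlockingPool {
 public:
  void Push(T item) {
    {
      std::lock_guard<std::mutex> lk(mu_);
      items_.push_back(std::move(item));
    }
    cv_.notify_one();
  }

  T Pop() {
    std::unique_lock<std::mutex> lk(mu_);
    cv_.wait(lk, [this] { return !items_.empty(); });
    T item = std::move(items_.front());
    items_.pop_front();
    return item;
  }

 private:
  std::mutex mu_;
  std::condition_variable cv_;
  std::deque<T> items_;
};

int main() {
  BlockingPool<int> pool;
  for (int i = 0; i < 4; ++i) pool.Push(i);  // like the 4 brpc channels per pserver
  int ctx = pool.Pop();  // borrowed for one RPC
  pool.Push(ctx);        // returned from the response callback
  return 0;
}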
- -#ifdef PADDLE_WITH_BRPC_RDMA - -#include "paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.h" -#include "brpc/channel.h" -#include "brpc/rdma/rdma_helper.h" -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace operators { -namespace distributed { - -RdmaMemPool& RdmaMemPool::Instance() { - static RdmaMemPool* g_rdma_mem_pool = new RdmaMemPool(); - return *g_rdma_mem_pool; -} - -void* RdmaMemPool::Find(const std::string& varname, int64_t size) { - pthread_rwlock_rdlock(&access_); - auto it = pool_.find(varname); - if (it == pool_.end()) { - pthread_rwlock_unlock(&access_); - return nullptr; - } - - auto info = it->second; - if (info.data_size != size) { - pthread_rwlock_unlock(&access_); - PADDLE_THROW(platform::errors::InvalidArgument( - "var:%s size:%ld != %ld", varname, size, info.data_size)); - return nullptr; - } - - pthread_rwlock_unlock(&access_); - return info.data; -} - -void RdmaMemPool::Register(const std::string& varname, void* data, - int64_t data_size) { - void* old = Find(varname, data_size); - if (old != nullptr) { - PADDLE_ENFORCE_EQ( - data, old, platform::errors::InvalidArgument("var:%s data:%ld != %ld", - varname, data, old)); - VLOG(7) << "Find on rdma:" << varname << " data:" << data - << " data_size:" << data_size; - return; - } - - VarInfo info; - info.data = data; - info.data_size = data_size; - - pthread_rwlock_wrlock(&access_); - pool_[varname] = info; - pthread_rwlock_unlock(&access_); - - if (brpc::rdma::RegisterMemoryForRdma(data, data_size)) { - PADDLE_THROW(platform::errors::Unavailable( - "Register memory for RDMA failed. Register %s data: %s data size %d " - "error.", - varname, data, data_size)); - } - - VLOG(4) << "register on rdma:" << varname << " data:" << data - << " data_size:" << data_size; -} - -} // namespace distributed -} // namespace operators -} // namespace paddle - -#endif diff --git a/paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.h b/paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.h deleted file mode 100644 index 156a93ec5784715c0a68c1af2e31d640dfc60277..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.h +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#ifdef PADDLE_WITH_BRPC_RDMA - -#include // NOLINT -#include -#include - -namespace paddle { -namespace operators { -namespace distributed { - -/* - * This class is used to avoid duplicated registion of brpc::rdma. 
- */ -class RdmaMemPool { - public: - static RdmaMemPool& Instance(); - RdmaMemPool() : access_(PTHREAD_RWLOCK_INITIALIZER) {} - - virtual ~RdmaMemPool() { pthread_rwlock_destroy(&access_); } - - void Register(const std::string& varname, void* data, int64_t size); - void* Find(const std::string& varname, int64_t size); - - private: - struct VarInfo { - void* data; - int64_t data_size; - - VarInfo() : data(nullptr), data_size(0) {} - }; - - private: - std::unordered_map pool_; - pthread_rwlock_t access_; -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle - -#endif diff --git a/paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.cc b/paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.cc deleted file mode 100644 index 411c0f36debd3b66014648f3bcad395d1b4a3579..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.cc +++ /dev/null @@ -1,224 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef PADDLE_WITH_NCCL -#include -#endif -#ifdef PADDLE_WITH_RCCL -#include -#endif -#include -#include -#include -#include // NOLINT - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.h" -#include "paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h" -#include "paddle/fluid/operators/distributed/brpc/brpc_variable_response.h" -#include "paddle/fluid/operators/distributed/distributed_pb.h" -#include "paddle/fluid/platform/profiler.h" - -namespace paddle { -namespace operators { -namespace distributed { - -class IOBufWriter { - public: - static void Append(const std::string& varname, butil::IOBuf* iobuf, int k, - const char* v, int64_t vlen) { - if (vlen >= std::numeric_limits::max() || vlen < 0) { - PADDDLE_THROW(platform::errors::Unavailable( - "Variable lenght is invalid. 
Variable name is %s, length is %d.", - varname, vlen)); - } - - iobuf->append(reinterpret_cast(&k), 4); - iobuf->append(reinterpret_cast(&vlen), 8); - iobuf->append(v, vlen); - } - - static void AppendTCPZeroCopy(butil::IOBuf* iobuf, int k, const char* v, - int64_t vlen, bool in_cuda_pinned, - void (*destroy)(void*), void* user_data) { - VLOG(7) << "AppendTCPZeroCopy " - << " k:" << k - << " data:" << static_cast(const_cast(v)) - << " data_size:" << vlen << " in_cuda_pinned:" << in_cuda_pinned; - - iobuf->append(reinterpret_cast(&k), 4); - iobuf->append(reinterpret_cast(&vlen), 8); - - // FIXME(gongwb): use append_zerocopy - /* - if (in_cuda_pinned) { - iobuf->append_zerocopy(v, vlen, IOBufWriter::FreeMemory); - } else { - iobuf->append_zerocopy(v, vlen, nullptr); - } - */ - iobuf->append(v, vlen); - destroy(user_data); - } - -#ifdef PADDLE_WITH_BRPC_RDMA - static void AppendRdmaZeroCopy(const std::string varname, butil::IOBuf* iobuf, - int k, const char* v, int64_t vlen, - bool in_cuda_pinned, void (*destroy)(void*), - void* user_data) { - VLOG(7) << "AppendRdmaZeroCopy varname:" << varname << " k:" << k - << " data:" << static_cast(const_cast(v)) - << " data_size:" << vlen << " in_cuda_pinned:" << in_cuda_pinned; - - iobuf->append(reinterpret_cast(&k), 4); - iobuf->append(reinterpret_cast(&vlen), 8); - - RdmaMemPool::Instance().Register( - varname, static_cast(const_cast(v)), vlen); - - // FIXME(gongwb): use append_zerocopy - // iobuf->append_zerocopy(v, vlen, nullptr); - iobuf->append(v, vlen); - destroy(user_data); - return; - } -#endif - - static void AppendZeroCopy(const std::string varname, butil::IOBuf* iobuf, - int k, const char* v, int64_t vlen, - bool in_cuda_pinned, void (*destroy)(void*), - void* user_data) { - if (vlen >= std::numeric_limits::max() || vlen < 0) { - PADDDLE_THROW(platform::errors::Unavailable( - "Variable lenght is invalid. Variable name is %s, length is %d.", - varname, vlen)); - } - -#ifdef PADDLE_WITH_BRPC_RDMA - IOBufWriter::AppendRdmaZeroCopy(varname, iobuf, k, v, vlen, in_cuda_pinned, - destroy, user_data); -#else - IOBufWriter::AppendTCPZeroCopy(iobuf, k, v, vlen, in_cuda_pinned, destroy, - user_data); -#endif - } -}; - -void SerializeToIOBuf(const std::string& name, framework::Variable* var, - const platform::DeviceContext& ctx, VarMsg* request, - butil::IOBuf* iobuf, const std::string& out_varname, - bool var_is_not_stable, int trainer_id, - const std::string& table_name) { - std::unique_ptr payload; - - request->set_varname(name); - request->set_trainer_id(trainer_id); - // Note: normally the profiler is enabled in 1 trainer, hence only - // 1 trainer returns true for ShouldSendProfileState(). It tells PS - // servers the trainer's profiling state so that PS can follow the - // trainer. 
- if (platform::ShouldSendProfileState()) { - if (platform::IsProfileEnabled()) { - request->set_profile(platform::kEnableProfiler); - } else { - request->set_profile(platform::kDisableProfiler); - } - } - if (!out_varname.empty()) { - request->set_out_varname(out_varname); - } - if (!table_name.empty()) { - request->set_table_name(table_name); - } - if (var->IsType()) { - request->set_type(::sendrecv::LOD_TENSOR); - payload.reset(new TensorPayload(GetTensorPayload(var, ctx, request))); - } else if (var->IsType()) { - request->set_type(::sendrecv::SELECTED_ROWS); - payload.reset(new TensorPayload(GetSelectedRowsPayload(var, ctx, request))); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - } else if (var->IsType()) { - request->set_type(::sendrecv::NCCL_ID); - const ncclUniqueId& uid = var->Get(); - // TODO(gongwb): use append_zero to avoid data copy. - IOBufWriter::Append(name, iobuf, - sendrecv::VariableMessage::kSerializedFieldNumber, - uid.internal, NCCL_UNIQUE_ID_BYTES); - return; -#endif - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Serialize does not support type: %s", typeid(var->Type()).name())); - } - - PADDLE_ENFORCE_NOT_NULL( - payload, - platform::errors::InvalidArgument( - "Not support type: %s, need to be LOD_TENSOR or SELECTED_ROWS.", - var->Type())); - - // FIXME(gongwb): it seems that can use zero copy. - if (var_is_not_stable) { - IOBufWriter::Append( - name, iobuf, ::sendrecv::VariableMessage::kSerializedFieldNumber, - static_cast(payload->ptr()), payload->memory_size()); - } else { - if (platform::is_gpu_place(ctx.GetPlace())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - IOBufWriter::AppendZeroCopy( - name, iobuf, ::sendrecv::VariableMessage::kSerializedFieldNumber, - static_cast(payload->ptr()), payload->memory_size(), - true, SerializeDestroyCallback, static_cast(payload.get())); - payload.release(); -#endif - } else { - IOBufWriter::AppendZeroCopy( - name, iobuf, ::sendrecv::VariableMessage::kSerializedFieldNumber, - static_cast(payload->ptr()), payload->memory_size(), - false, SerializeDestroyCallback, static_cast(payload.get())); - payload.release(); - } - } - - if (var->IsType()) { - auto* slr = var->GetMutable(); - PADDLE_ENFORCE_EQ(VectorElemName(slr->rows()), typeid(int64_t).name(), - platform::errors::InvalidArgument( - "Got wrong type: %s, expect type: int64_t", - VectorElemName(slr->rows()))); - size_t rows_memory_size = slr->rows().size() * sizeof(int64_t); - - IOBufWriter::Append(name, iobuf, - ::sendrecv::VariableMessage::kRowsFieldNumber, - reinterpret_cast(slr->rows().data()), - static_cast(rows_memory_size)); - } -} - -void DeserializeFromIOBuf(const ::sendrecv::VariableMessage& meta, - const butil::IOBuf& iobuf, - const platform::DeviceContext& ctx, - const framework::Scope* scope, - framework::Variable** var, int* trainer_id) { - operators::distributed::BRPCVariableResponse resp(scope, &ctx); - PADDLE_ENFORCE_EQ( - resp.Parse(iobuf, meta), 0, - platform::errors::InvalidArgument("parse iobuf to tensor error!")); - *var = resp.GetVar(); - *trainer_id = resp.GetTrainerId(); -} - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h b/paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h deleted file mode 100644 index a5bdc331eb29c7c0fe00d7f346025426b51e1cb3..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h +++ /dev/null @@ -1,49 +0,0 @@ 
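The serialization helpers above frame each field as a 4-byte field number, an 8-byte payload length, and then the raw payload bytes. The sketch below shows only that byte layout; the real IOBufWriter::Append writes into a butil::IOBuf, and a std::string is used here to keep the example free of the brpc dependency. Appending the raw in-memory representation of k and vlen matches the little-endian reads on the receiving side only on a little-endian host, which is what the deleted code assumes.

#include <cstdint>
#include <cstdio>
#include <string>

// Emit one framed field: [4-byte field number][8-byte length][payload].
void AppendField(std::string* buf, int k, const char* v, int64_t vlen) {
  buf->append(reinterpret_cast<const char*>(&k), 4);
  buf->append(reinterpret_cast<const char*>(&vlen), 8);
  buf->append(v, static_cast<size_t>(vlen));
}

int main() {
  std::string wire;
  const char payload[] = "tensor-bytes";
  AppendField(&wire, /*field number=*/7, payload, sizeof(payload) - 1);

  // 12 bytes of header followed by the payload itself.
  std::printf("total %zu bytes = 12 header + %zu payload\n", wire.size(),
              wire.size() - 12);
  return 0;
}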
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include -#include - -#include "brpc/channel.h" -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/distributed/distributed_pb.h" -#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" - -namespace paddle { -namespace operators { -namespace distributed { - -void SerializeToIOBuf(const std::string& name, framework::Variable* var, - const platform::DeviceContext& ctx, VarMsg* request, - butil::IOBuf* iobuf, const std::string& out_varname, - bool var_is_not_stable, const int trainer_id = 0, - const std::string& table_name = std::string()); - -void DeserializeFromIOBuf(const VarMsg& meta, const butil::IOBuf& iobuf, - const platform::DeviceContext& ctx, - const framework::Scope* scope, - framework::Variable** var, int* trainer_id); - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc/brpc_serde_test.cc b/paddle/fluid/operators/distributed/brpc/brpc_serde_test.cc deleted file mode 100644 index bcf20ad076b11f86870d8fbe980b8adad5c96ea8..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/brpc/brpc_serde_test.cc +++ /dev/null @@ -1,175 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include // NOLINT - -#include "brpc/channel.h" -#include "google/protobuf/text_format.h" -#include "gtest/gtest.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/framework/variable.h" -#include "paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h" -#include "paddle/fluid/operators/distributed/brpc/brpc_variable_response.h" -#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" -#include "paddle/fluid/operators/distributed/variable_response.h" -#include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/platform/place.h" -#include "paddle/fluid/string/printf.h" - -namespace framework = paddle::framework; -namespace platform = paddle::platform; -namespace operators = paddle::operators; -namespace math = paddle::operators::math; -namespace memory = paddle::memory; - -void RunSerdeTestSelectedRows(platform::Place place) { - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto& ctx = *pool.Get(place); - - butil::IOBuf iobuf; - sendrecv::VariableMessage msg; - int tensor_numel = 564 * 128; - - // serialize var to IOBuf - { - framework::Variable var; - auto* slr = var.GetMutable(); - slr->set_height(1000); - auto* tensor = slr->mutable_value(); - auto* rows = slr->mutable_rows(); - tensor->Resize(framework::make_ddim({564, 128})); - tensor->mutable_data(place); - math::set_constant(ctx, tensor, 32.7); - for (int i = 0; i < 564; ++i) rows->push_back(i); - - operators::distributed::SerializeToIOBuf("myvar", &var, ctx, &msg, &iobuf, - "", false); - } - - // desrialize - { - framework::Scope scope; - scope.Var("myvar"); - operators::distributed::BRPCVariableResponse resp(&scope, &ctx); - EXPECT_EQ(resp.Parse(iobuf, msg), 0); - - framework::Variable* var2 = resp.GetVar(); - - auto* slr2 = var2->GetMutable(); - auto* tensor2 = slr2->mutable_value(); - auto* rows2 = slr2->mutable_rows(); - float* tensor_data2 = nullptr; - framework::Tensor tmp_tensor; - - if (platform::is_gpu_place(ctx.GetPlace())) { - platform::CPUPlace cpu; - framework::TensorCopy(*tensor2, cpu, &tmp_tensor); - tensor_data2 = tmp_tensor.data(); - } else { - tensor_data2 = const_cast(tensor2->data()); - } - const int64_t* rows_data2 = rows2->data(); - - for (int i = 0; i < tensor_numel; ++i) { - EXPECT_FLOAT_EQ(tensor_data2[i], 32.7); - } - for (size_t i = 0; i < rows2->size(); ++i) { - EXPECT_EQ(rows_data2[i], static_cast(i)); - } - EXPECT_EQ(slr2->height(), 1000); - } -} - -void RunTestLodTensor(platform::Place place) { - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto& ctx = *pool.Get(place); - - // serialize var to ByteBuffer - butil::IOBuf iobuf; - sendrecv::VariableMessage msg; - int tensor_numel = 512 * 8 * 4 * 2; - { - framework::Variable var; - auto* tensor = var.GetMutable(); - tensor->Resize(framework::make_ddim({512, 8, 4, 2})); - framework::LoD lod; - lod.push_back(framework::Vector({1, 3, 8})); - tensor->set_lod(lod); - tensor->mutable_data(place); - math::set_constant(ctx, tensor, 31.9); - - operators::distributed::SerializeToIOBuf("myvar", &var, ctx, &msg, &iobuf, - "", false); - } - - // check sendrecv::VariableMessage meta data - { - EXPECT_EQ(msg.varname(), "myvar"); - EXPECT_EQ(msg.type(), 0); - EXPECT_EQ(msg.dims()[0], 512); - EXPECT_EQ(msg.dims()[1], 8); - EXPECT_EQ(msg.dims()[2], 4); - EXPECT_EQ(msg.dims()[3], 2); - EXPECT_EQ(msg.lod_level(), 1); - EXPECT_EQ(msg.lod(0).lod_data(0), 1); - 
EXPECT_EQ(msg.lod(0).lod_data(1), 3); - EXPECT_EQ(msg.lod(0).lod_data(2), 8); - } - - // deserialize - { - framework::Scope scope; - scope.Var("myvar"); - operators::distributed::BRPCVariableResponse resp(&scope, &ctx); - EXPECT_EQ(resp.Parse(iobuf, msg), 0); - - framework::Variable* var2 = resp.GetVar(); - - auto tensor2 = var2->Get(); - float* tensor_data2 = nullptr; - framework::Tensor tmp_tensor; - - if (platform::is_gpu_place(ctx.GetPlace())) { - platform::CPUPlace cpu; - framework::TensorCopy(tensor2, cpu, &tmp_tensor); - tensor_data2 = tmp_tensor.data(); - } else { - tensor_data2 = const_cast(tensor2.data()); - } - - for (int i = 0; i < tensor_numel; ++i) - EXPECT_FLOAT_EQ(tensor_data2[i], 31.9); - } -} - -TEST(LodTensor, Run) { - platform::CPUPlace place; - RunTestLodTensor(place); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - platform::CUDAPlace gpu(0); - RunTestLodTensor(gpu); -#endif -} - -TEST(SelectedRows, Run) { - platform::CPUPlace place; - RunSerdeTestSelectedRows(place); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - platform::CUDAPlace gpu; - RunSerdeTestSelectedRows(gpu); -#endif -} diff --git a/paddle/fluid/operators/distributed/brpc/brpc_server.cc b/paddle/fluid/operators/distributed/brpc/brpc_server.cc deleted file mode 100644 index 5ca26f006bf20e667aaad55baf844387dbd31020..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/brpc/brpc_server.cc +++ /dev/null @@ -1,417 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/operators/distributed/brpc/brpc_server.h" -#include -#include -#include "paddle/fluid/framework/threadpool.h" -#include "paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h" -#include "paddle/fluid/operators/distributed/brpc/brpc_variable_response.h" -#include "paddle/fluid/operators/distributed/request_handler.h" - -namespace sendrecv { - -namespace distributed = paddle::operators::distributed; - -typedef std::unordered_map - HandlerMap; - -class BRPCServiceImpl : public SendRecvService { - public: - explicit BRPCServiceImpl(const HandlerMap& rpc_call_map, - distributed::RPCServer* rpc_server) - : rpc_server_(rpc_server) { - VLOG(3) << "BRPCServiceImpl size: " << rpc_call_map.size(); - auto it = rpc_call_map.find(distributed::kRequestSend); - if (it != rpc_call_map.end()) { - request_send_h_ = it->second; - send_threads_.reset(new paddle::framework::ThreadPool( - rpc_server_->GetThreadNum(distributed::kRequestSend))); - } - - it = rpc_call_map.find(distributed::kRequestGet); - if (it != rpc_call_map.end()) { - request_get_h_ = it->second; - get_threads_.reset(new paddle::framework::ThreadPool( - rpc_server_->GetThreadNum(distributed::kRequestGet))); - } - - it = rpc_call_map.find(distributed::kRequestGetNoBarrier); - if (it != rpc_call_map.end()) { - request_getnobarrier_h_ = it->second; - getnobarrier_threads_.reset(new paddle::framework::ThreadPool( - rpc_server_->GetThreadNum(distributed::kRequestGetNoBarrier))); - } - - it = rpc_call_map.find(distributed::kRequestPrefetch); - if (it != rpc_call_map.end()) { - request_prefetch_h_ = it->second; - prefetch_threads_.reset(new paddle::framework::ThreadPool( - rpc_server_->GetThreadNum(distributed::kRequestPrefetch))); - } - - it = rpc_call_map.find(distributed::kRequestCheckpoint); - if (it != rpc_call_map.end()) { - request_checkpoint_h_ = it->second; - checkpoint_notify_threads_.reset(new paddle::framework::ThreadPool( - rpc_server_->GetThreadNum(distributed::kRequestPrefetch))); - } - - it = rpc_call_map.find(distributed::kRequestGetMonomerVariable); - if (it != rpc_call_map.end()) { - request_get_monomer_handler_h_ = it->second; - } - - it = rpc_call_map.find(distributed::kRequestGetMonomerBarrier); - if (it != rpc_call_map.end()) { - request_get_monomer_barrier_handler_h_ = it->second; - } - } - - virtual ~BRPCServiceImpl() {} - void SendVariable(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, VoidMessage* response, - google::protobuf::Closure* done) override { - send_threads_->Run( - [=] { _SendVariable(cntl_butil, request, response, done); }); - } - - void _SendVariable(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, VoidMessage* response, - google::protobuf::Closure* done) { - PADDLE_ENFORCE_NOT_NULL( - request_send_h_, platform::errors::PreconditionNotMet( - "RequestSend handler should be registed first!")); - brpc::ClosureGuard done_guard(done); - brpc::Controller* cntl = static_cast(cntl_butil); - - std::string varname = request->varname(); - VLOG(3) << "RequestSend var_name:" << varname - << ", trainer_id:" << request->trainer_id() - << ", from:" << cntl->remote_side(); - - distributed::BRPCVariableResponse resp(request_send_h_->scope(), - request_send_h_->dev_ctx(), - request_send_h_->distributed_mode()); - PADDLE_ENFORCE_EQ( - resp.Parse(cntl->request_attachment(), *request), 0, - platform::errors::InvalidArgument("parse iobuf to tensor error!")); - - auto scope = resp.GetMutableLocalScope(); - auto invar = resp.GetVar(); - int 
trainer_id = request->trainer_id(); - paddle::framework::Variable* outvar = nullptr; - - request_send_h_->Handle(varname, scope, invar, &outvar, trainer_id); - } - - void GetVariable(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, VariableMessage* response, - google::protobuf::Closure* done) override { - get_threads_->Run( - [=] { _GetVariable(cntl_butil, request, response, done); }); - } - - void GetVariableNoBarrier(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, - VariableMessage* response, - google::protobuf::Closure* done) override { - getnobarrier_threads_->Run( - [=] { _GetVariableNoBarrier(cntl_butil, request, response, done); }); - } - - void _GetVariable(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, VariableMessage* response, - google::protobuf::Closure* done) { - PADDLE_ENFORCE_NOT_NULL( - request_get_h_, platform::errors::PreconditionNotMet( - "RequestGet handler should be registed first!")); - - brpc::ClosureGuard done_guard(done); - brpc::Controller* cntl = static_cast(cntl_butil); - - std::string varname = request->varname(); - std::string out_varname = request->out_varname(); - VLOG(3) << "RequestGet varname:" << varname - << ", out_varname:" << out_varname - << ", trainer_id:" << request->trainer_id() - << ", from:" << cntl->remote_side(); - - auto scope = request_get_h_->scope(); - paddle::framework::Variable* invar = nullptr; - int trainer_id = request->trainer_id(); - paddle::framework::Variable* outvar = nullptr; - - request_get_h_->Handle(varname, scope, invar, &outvar, trainer_id, - out_varname); - - if (outvar) { - distributed::SerializeToIOBuf(out_varname, outvar, - *request_get_h_->dev_ctx(), response, - &cntl->response_attachment(), "", false); - } - } - - void _GetVariableNoBarrier(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, - VariableMessage* response, - google::protobuf::Closure* done) { - PADDLE_ENFORCE_NOT_NULL( - request_getnobarrier_h_, - platform::errors::PreconditionNotMet( - "RequestGetNoBarrier handler should be registed first!")); - - brpc::ClosureGuard done_guard(done); - brpc::Controller* cntl = static_cast(cntl_butil); - - std::string varname = request->varname(); - std::string out_varname = request->out_varname(); - int trainer_id = request->trainer_id(); - - VLOG(3) << "RequestGetNoBarrier varname:" << varname - << ", out_varname:" << out_varname << ", trainer_id:" << trainer_id - << ", from:" << cntl->remote_side(); - - auto scope = request_getnobarrier_h_->scope(); - paddle::framework::Variable* invar = nullptr; - paddle::framework::Variable* outvar = nullptr; - - request_getnobarrier_h_->Handle(varname, scope, invar, &outvar, trainer_id, - out_varname); - - if (outvar) { - distributed::SerializeToIOBuf( - out_varname, outvar, *request_getnobarrier_h_->dev_ctx(), response, - &cntl->response_attachment(), "", false); - } - } - - void PrefetchVariable(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, - VariableMessage* response, - google::protobuf::Closure* done) override { - prefetch_threads_->Run( - [=] { _PrefetchVariable(cntl_butil, request, response, done); }); - } - - void _PrefetchVariable(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, - VariableMessage* response, - google::protobuf::Closure* done) { - PADDLE_ENFORCE_NOT_NULL(request_prefetch_h_, - platform::errors::PreconditionNotMet( - "kRequestPrefetch handler should be registed first!"); - - 
brpc::ClosureGuard done_guard(done); - brpc::Controller* cntl = static_cast(cntl_butil); - - // prefetch process... - std::string in_var_name = request->varname(); - std::string out_var_name = request->out_varname(); - VLOG(3) << "RequestPrefetch, in_var_name: " << in_var_name - << ", out_var_name: " << out_var_name - << ", trainer_id:" << request->trainer_id() - << ", from:" << cntl->remote_side(); - - distributed::BRPCVariableResponse resp( - request_prefetch_h_->scope(), request_prefetch_h_->dev_ctx(), true); - - PADDLE_ENFORCE_EQ(resp.Parse(cntl->request_attachment(), *request), 0, - platform::errors::InvalidArgument( - "parse iobuf to tensor error!")); - - auto scope = resp.GetMutableLocalScope(); - auto invar = scope->FindVar(in_var_name); - std::string table_name = request->table_name(); - int trainer_id = request->trainer_id(); - paddle::framework::Variable* outvar = scope->Var(out_var_name); - - request_prefetch_h_->Handle(in_var_name, scope, invar, &outvar, trainer_id, - out_var_name, table_name); - - distributed::SerializeToIOBuf(out_var_name, outvar, - *request_prefetch_h_->dev_ctx(), response, - &cntl->response_attachment(), "", true); - } - - void CheckpointNotify(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, VoidMessage* response, - google::protobuf::Closure* done) override { - checkpoint_notify_threads_->Run( - [=] { _CheckpointNotify(cntl_butil, request, response, done); }); - } - - void _CheckpointNotify(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, VoidMessage* response, - google::protobuf::Closure* done) { - PADDLE_ENFORCE_NOT_NULL( - request_checkpoint_h_, - platform::errors::PreconditionNotMet( - "kRequestCheckpointNotify handler should be registed first!")); - - brpc::ClosureGuard done_guard(done); - brpc::Controller* cntl = static_cast(cntl_butil); - - distributed::BRPCVariableResponse resp(request_checkpoint_h_->scope(), - request_checkpoint_h_->dev_ctx()); - - auto scope = resp.GetMutableLocalScope(); - - std::string checkpoint_notify = request->varname(); - std::string checkpoint_dir = request->out_varname(); - int trainer_id = request->trainer_id(); - - VLOG(4) << "RequestCheckpointNotify notify: " << checkpoint_notify - << ", dir: " << checkpoint_dir - << ", trainer_id:" << request->trainer_id() - << ", from:" << cntl->remote_side(); - - request_checkpoint_h_->Handle(checkpoint_notify, scope, nullptr, nullptr, - trainer_id, checkpoint_dir); - } - - void GetMonomerVariable(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, - VariableMessage* response, - google::protobuf::Closure* done) override { - PADDLE_ENFORCE_NOT_NULL( - request_get_monomer_handler_h_, - platform::errors::PreconditionNotMet( - "kRequestGetMonomerVariable handler should be registed first!")); - - brpc::ClosureGuard done_guard(done); - brpc::Controller* cntl = static_cast(cntl_butil); - - // proc request. 
- std::string varname = request->varname(); - VLOG(3) << "GetMonomerVariable " << varname - << ", trainer_id:" << request->trainer_id() - << ", from:" << cntl->remote_side(); - - rpc_server_->WaitVarCond(varname); - distributed::MonomerHandle h = rpc_server_->GetMonomer(varname); - - auto scope = h.scope_; - auto invar = scope->FindVar(varname); - paddle::framework::Variable* outvar = nullptr; - - request_get_monomer_handler_h_->Handle(varname, scope, invar, &outvar, - request->trainer_id()); - - if (outvar) { - distributed::SerializeToIOBuf(varname, outvar, *h.dev_ctx_, response, - &cntl->response_attachment(), "", false); - } - } - - void GetMonomerBarrier(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, VoidMessage* response, - google::protobuf::Closure* done) override { - PADDLE_ENFORCE_NOT_NULL( - request_get_monomer_barrier_handler_h_, - platform::errors::PreconditionNotMet( - "RequestGetMonomerBarrier handler should be registed first!")); - - brpc::ClosureGuard done_guard(done); - brpc::Controller* cntl = static_cast(cntl_butil); - - std::string varname = request->varname(); - VLOG(3) << "RequestGetMonomerBarrier var_name:" << varname - << ", trainer_id:" << request->trainer_id() - << ", from:" << cntl->remote_side(); - - rpc_server_->WaitVarCond(varname); - distributed::MonomerHandle h = rpc_server_->GetMonomer(varname); - - paddle::framework::Scope* scope = nullptr; - paddle::framework::Variable* invar = nullptr; - paddle::framework::Variable* outvar = nullptr; - - request_get_monomer_barrier_handler_h_->Handle( - varname, scope, invar, &outvar, request->trainer_id()); - } - - private: - distributed::RequestHandler* request_send_h_{nullptr}; - distributed::RequestHandler* request_get_h_{nullptr}; - distributed::RequestHandler* request_getnobarrier_h_{nullptr}; - distributed::RequestHandler* request_prefetch_h_{nullptr}; - distributed::RequestHandler* request_checkpoint_h_{nullptr}; - distributed::RequestHandler* request_get_monomer_handler_h_{nullptr}; - distributed::RequestHandler* request_get_monomer_barrier_handler_h_{nullptr}; - - distributed::RPCServer* rpc_server_{nullptr}; - - // FIXME(gongwb): brpc should support process one rpc use one threadpool. - std::unique_ptr send_threads_; - std::unique_ptr get_threads_; - std::unique_ptr getnobarrier_threads_; - std::unique_ptr prefetch_threads_; - std::unique_ptr checkpoint_notify_threads_; -}; -} // namespace sendrecv - -namespace paddle { -namespace operators { -namespace distributed { - -void AsyncBRPCServer::StartServer() { - // Instance of your service. - sendrecv::BRPCServiceImpl service_impl(rpc_call_map_, this); - - // Add the service into server. Notice the second parameter, because the - // service is put on stack, we don't want server to delete it, otherwise - // use brpc::SERVER_OWNS_SERVICE. 
- if (server_.AddService(&service_impl, brpc::SERVER_DOESNT_OWN_SERVICE) != 0) { - PADDDLE_THROW(platform::errors::Unavailable( - "Failed to add service into BRPC server.")); - return; - } - - brpc::ServerOptions options; -#ifdef PADDLE_WITH_BRPC_RDMA - options.use_rdma = true; -#endif - options.idle_timeout_sec = idle_timeout_s_; - options.max_concurrency = max_concurrency_; - if (server_.Start(bind_address_.c_str(), &options) != 0) { - PADDDLE_THROW(platform::errors::Unavailable( - "Failed to start EchoServer %s.", bind_address_)); - return; - } - - butil::EndPoint ep = server_.listen_address(); - selected_port_ = ep.port; - - { - std::lock_guard lock(this->mutex_ready_); - ready_ = 1; - } - condition_ready_.notify_all(); - - server_.Join(); -} - -void AsyncBRPCServer::ShutDownImpl() { server_.Stop(1000); } - -void AsyncBRPCServer::WaitServerReady() { - VLOG(3) << "AsyncGRPCServer is wait server ready"; - std::unique_lock lock(this->mutex_ready_); - condition_ready_.wait(lock, [=] { return this->ready_ == 1; }); - VLOG(3) << "AsyncGRPCServer WaitSeverReady"; -} - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc/brpc_server.h b/paddle/fluid/operators/distributed/brpc/brpc_server.h deleted file mode 100644 index 78bbe5adc0813d7cf29963c78947d52bcaea9643..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/brpc/brpc_server.h +++ /dev/null @@ -1,53 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include // NOLINT -#include // NOLINT -#include - -#include "brpc/server.h" -#include "paddle/fluid/operators/distributed/distributed_pb.h" -#include "paddle/fluid/operators/distributed/rpc_server.h" - -namespace paddle { -namespace operators { -namespace distributed { - -class AsyncBRPCServer final : public RPCServer { - public: - explicit AsyncBRPCServer(const std::string& address, int client_num) - : RPCServer(address, client_num), ready_(0) {} - - virtual ~AsyncBRPCServer() {} - void StartServer() override; - void WaitServerReady() override; - - private: - void ShutDownImpl() override; - - brpc::Server server_; - - static constexpr int idle_timeout_s_ = -1; - static constexpr int max_concurrency_ = 0; - - std::mutex mutex_ready_; - std::condition_variable condition_ready_; - int ready_; -}; - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc/brpc_variable_response.cc b/paddle/fluid/operators/distributed/brpc/brpc_variable_response.cc deleted file mode 100644 index 49521e8a77057bf496eca3249230a3fbc278c262..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/brpc/brpc_variable_response.cc +++ /dev/null @@ -1,75 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// - -#include "paddle/fluid/operators/distributed/brpc/brpc_variable_response.h" -#include "paddle/fluid/operators/distributed/send_recv.pb.h" - -namespace paddle { -namespace operators { -namespace distributed { - -namespace pb = ::google::protobuf; -using vr = ::sendrecv::VariableMessage; - -int BRPCVariableResponse::Parse(Source* source) { - pb::io::ZeroCopyInputStream* input_stream = source->contents(); - pb::io::CodedInputStream input(input_stream); - input.SetTotalBytesLimit(INT_MAX, INT_MAX); - - while (1) { - unsigned int tag = 0; - if (!input.ReadLittleEndian32(&tag)) { - break; - } - - uint64_t num_bytes = 0; - if (!input.ReadLittleEndian64(&num_bytes)) { - break; - } - - int field = static_cast(tag); - int ret = field == 0 ? -1 : field; - switch (field) { - case vr::kSerializedFieldNumber: { - if (!ProcSerializedField(field, &input, num_bytes)) { - return ret; - } - break; - } - case vr::kRowsFieldNumber: { - PADDLE_ENFORCE((meta_.type() == sendrecv::SELECTED_ROWS || - meta_.type() == sendrecv::LOD_TENSOR) && - meta_.varname() != "", - platform::errors::PreconditionNotMet( - "meta info should be got first!")); - - if (!CopySelectRowsData(&input, *dev_ctx_, num_bytes)) { - return ret; - } - break; - } - default: { - PADDLE_THROW(platform::errors::Unavailable( - "not surpported %u fieldnumber", field)); - return ret; - } - } - } - - return 0; -} -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc/brpc_variable_response.h b/paddle/fluid/operators/distributed/brpc/brpc_variable_response.h deleted file mode 100644 index 6282f08a725367f74dbcf1fa6a2ad49469d64725..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/brpc/brpc_variable_response.h +++ /dev/null @@ -1,67 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
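The Parse loop just deleted walks the request attachment as a sequence of (field number, length, payload) records using protobuf's little-endian stream helpers. Below is a reader-side counterpart to the append sketch earlier, hand-building a buffer and walking it with the same CodedInputStream calls; the real code wraps a butil::IOBuf rather than an ArrayInputStream, and the field handling here is illustrative only.

#include <cstdint>
#include <iostream>
#include <string>

#include "google/protobuf/io/coded_stream.h"
#include "google/protobuf/io/zero_copy_stream_impl_lite.h"

int main() {
  // Hand-built wire buffer: one record with field number 7 and 5 payload bytes.
  std::string wire;
  std::uint32_t field = 7;
  std::uint64_t len = 5;
  wire.append(reinterpret_cast<const char*>(&field), 4);
  wire.append(reinterpret_cast<const char*>(&len), 8);
  wire.append("hello", 5);

  google::protobuf::io::ArrayInputStream raw(wire.data(),
                                             static_cast<int>(wire.size()));
  google::protobuf::io::CodedInputStream in(&raw);

  std::uint32_t tag = 0;
  std::uint64_t num_bytes = 0;
  while (in.ReadLittleEndian32(&tag) && in.ReadLittleEndian64(&num_bytes)) {
    std::string payload;
    if (!in.ReadString(&payload, static_cast<int>(num_bytes))) {
      break;  // truncated record
    }
    std::cout << "field " << tag << " -> " << payload << "\n";
  }
  return 0;
}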
- -#pragma once - -#include - -#include "brpc/channel.h" -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/var_type.h" - -#include "paddle/fluid/operators/distributed/distributed_pb.h" - -#include "google/protobuf/io/coded_stream.h" -#include "google/protobuf/io/zero_copy_stream.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/distributed/variable_response.h" - -namespace paddle { -namespace operators { -namespace distributed { - -class BRPCSourceWrapper : public Source { - public: - explicit BRPCSourceWrapper(const butil::IOBuf& iobuf) : source_(iobuf) {} - ::google::protobuf::io::ZeroCopyInputStream* contents() override { - return &source_; - } - - private: - butil::IOBufAsZeroCopyInputStream source_; -}; - -class BRPCVariableResponse : public VariableResponse { - public: - BRPCVariableResponse(const framework::Scope* scope, - const platform::DeviceContext* dev_ctx, - bool create_scope = false) - : VariableResponse(scope, dev_ctx, create_scope) {} - - virtual ~BRPCVariableResponse() {} - - // parse attachment from iobuf - int Parse(Source* source) override; - int Parse(const butil::IOBuf& iobuf, const sendrecv::VariableMessage& meta) { - BRPCSourceWrapper wrapper(iobuf); - return VariableResponse::Parse(&wrapper, meta); - } -}; - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/collective_client.cc b/paddle/fluid/operators/distributed/collective_client.cc deleted file mode 100644 index fcd3e6abead510393736a9253af2ae1068357a68..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/collective_client.cc +++ /dev/null @@ -1,57 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/operators/distributed/collective_client.h" -#include -#include "gflags/gflags.h" - -DECLARE_int32(rpc_deadline); - -namespace paddle { -namespace operators { -namespace distributed { -std::once_flag CollectiveClient::init_flag_; -std::unique_ptr CollectiveClient::client_(nullptr); - -bool CollectiveClient::Gather(const std::vector& remote_vars, - std::vector* dst, - const platform::DeviceContext& ctx, - framework::Scope* scope, int64_t time_out) { - for (auto r : remote_vars) { - VLOG(50) << "begin gather from ep:" << r.String(); - scope->Var(r.var_name_)->GetMutable(); - VarHandlePtr ptr = rpc_client_->AsyncGetMonomerVariable( - r.ep_, ctx, *scope, r.var_name_, time_out); - } - - rpc_client_->Wait(); - - for (auto r : remote_vars) { - auto select_rows = - scope->FindVar(r.var_name_)->GetMutable(); - dst->push_back(select_rows); - - VLOG(4) << "gather from ep:" << r.String() - << ", select_rows:" << GetSelectedRowsInfo(*select_rows); - - rpc_client_->AsyncGetMonomerBarrier(r.ep_, r.var_name_); - } - - rpc_client_->Wait(); - return true; -} - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/collective_client.h b/paddle/fluid/operators/distributed/collective_client.h deleted file mode 100644 index e7d8bb8df9834728682ea131f3ef0d60786908e5..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/collective_client.h +++ /dev/null @@ -1,104 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include // NOLINT -#include -#include -#include - -#include "gflags/gflags.h" -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/operators/distributed/distributed.h" -#include "paddle/fluid/operators/distributed/request_handler.h" - -namespace paddle { -namespace framework { -class Scope; -class SelectedRows; -} // namespace framework -namespace platform { -class DeviceContext; -} // namespace platform -} // namespace paddle - -DECLARE_int32(rpc_deadline); - -namespace paddle { -namespace operators { -namespace distributed { - -inline std::string GetSelectedRowsInfo(const framework::SelectedRows& slr) { - std::stringstream ss; - ss << ", height:" << slr.height() << ", rows:["; - for (unsigned int i = 0; i < slr.rows().size(); i++) { - if (i != slr.rows().size() - 1) { - ss << slr.rows()[i] << ","; - } else { - ss << slr.rows()[i]; - } - } - ss << "], dims:" << slr.value().dims(); - return ss.str(); -} - -struct RemoteVar { - std::string ep_; - std::string var_name_; - int trainer_id_{0}; - - std::string String() { - std::stringstream ss; - ss << "ep:" << ep_ << ", var_name:" << var_name_ - << ", trainer_id:" << trainer_id_; - - return ss.str(); - } -}; - -class CollectiveClient { - public: - CollectiveClient() { - rpc_client_.reset(new RPCCLIENT_T()); - rpc_client_->InitImpl(); - } - virtual ~CollectiveClient() {} - - // note this function will retain the rank order. - bool Gather(const std::vector& remote_vars, - std::vector* dst, - const platform::DeviceContext& ctx, framework::Scope* scope, - int64_t time_out = FLAGS_rpc_deadline); - - static CollectiveClient* GetInstance() { - std::call_once(init_flag_, [&]() { - if (client_.get() == nullptr) { - client_.reset(new CollectiveClient()); - } - }); - return client_.get(); - } - - private: - std::unique_ptr rpc_client_; - - static std::once_flag init_flag_; - static std::unique_ptr client_; -}; -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/collective_server.cc b/paddle/fluid/operators/distributed/collective_server.cc deleted file mode 100644 index cdd37742d2d5a5a882320cbff1e67a353b4af5f8..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/collective_server.cc +++ /dev/null @@ -1,68 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/distributed/collective_server.h" -#include - -DEFINE_int32(collective_get_thread_num, 5, "number of threads for rpc get"); - -namespace paddle { -namespace operators { -namespace distributed { - -std::once_flag CollectiveServer::init_flag_; -std::shared_ptr CollectiveServer::collective_server_(nullptr); - -CollectiveServer::CollectiveServer(const std::string& end_point, int fan_in) { - VLOG(1) << "Create colllective server:" << end_point << ", fan_in:" << fan_in; - rpc_server_.reset(new RPCSERVER_T(end_point, fan_in)); -} - -void CollectiveServer::Stop() { - rpc_server_->ShutDown(); - server_thread_->join(); - loop_thread_->join(); -} - -void CollectiveServer::StartServer() { - get_monomer_handler_.reset(new GetMonomerHandler()); - get_monomer_handler_->SetRPCServer(rpc_server_.get()); - - get_barrier_handler_.reset(new GetMonomerBarrierHandler()); - get_barrier_handler_->SetRPCServer(rpc_server_.get()); - - rpc_server_->RegisterRPC(distributed::kRequestGetMonomerVariable, - get_monomer_handler_.get(), - FLAGS_collective_get_thread_num); - rpc_server_->RegisterRPC(distributed::kRequestGetMonomerBarrier, - get_barrier_handler_.get(), 1); - - server_thread_.reset(new std::thread([&]() { rpc_server_->StartServer(); })); - rpc_server_->WaitServerReady(); - - loop_thread_.reset(new std::thread([&]() { - while (true) { - if (rpc_server_->IsExit()) { - LOG(WARNING) << "get exit!rpc_processor break!"; - break; - } - sleep(1); - } - VLOG(1) << "CollectiveServer loop_thread end"; - })); -} - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/collective_server.h b/paddle/fluid/operators/distributed/collective_server.h deleted file mode 100644 index 4964923286094abe01559b715a20eeb8da7e2a0d..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/collective_server.h +++ /dev/null @@ -1,116 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include -#include -#include // NOLINT -#include -#include -#include "gflags/gflags.h" -#include "paddle/fluid/operators/distributed/distributed.h" -#include "paddle/fluid/operators/distributed/request_handler.h" -#include "paddle/fluid/operators/distributed/request_handler_impl.h" -#include "paddle/fluid/operators/distributed/rpc_server.h" - -namespace paddle { -namespace framework { -class Variable; -} // namespace framework -} // namespace paddle - -namespace paddle { -namespace operators { -namespace distributed { - -class CollectiveServer; - -class GetMonomerHandler final : public RequestHandler { - public: - GetMonomerHandler() : RequestHandler(true) {} - virtual ~GetMonomerHandler() {} - bool Handle(const std::string& var_name, framework::Scope* scope, - framework::Variable* var, framework::Variable** outvar, - const int trainer_id, const std::string& out_var_name = "", - const std::string& table_name = "") override { - VLOG(50) << "GetMonomerHandler recv " << var_name; - - *outvar = scope->FindVar(var_name); - PADDLE_ENFORCE_NOT_NULL( - outvar, platform::errors::NotFound("var: %s is not found.", var_name)); - - return true; - } -}; - -class GetMonomerBarrierHandler final : public RequestHandler { - public: - GetMonomerBarrierHandler() : RequestHandler(true) {} - virtual ~GetMonomerBarrierHandler() {} - bool Handle(const std::string& var_name, framework::Scope* scope, - framework::Variable* var, framework::Variable** outvar, - const int trainer_id, const std::string& out_var_name = "", - const std::string& table_name = "") override { - VLOG(50) << "GetMonomerHandler recv " << var_name; - - rpc_server_->IncreaseVarBarrier(var_name); - - return true; - } -}; - -class CollectiveServer final { - public: - explicit CollectiveServer(const std::string& end_point, int fan_in); - - virtual ~CollectiveServer() {} - - void StartServer(); - - static CollectiveServer* GetInstance(const std::string& end_point, - int fan_in) { - std::call_once(init_flag_, [&]() { - if (collective_server_.get() == nullptr) { - collective_server_.reset(new CollectiveServer(end_point, fan_in)); - collective_server_->StartServer(); - } - }); - - return collective_server_.get(); - } - - std::shared_ptr GetRPCServer() { return rpc_server_; } - - void Stop(); - - private: - std::unique_ptr get_monomer_handler_; - std::unique_ptr get_barrier_handler_; - - std::shared_ptr rpc_server_; - std::shared_ptr server_thread_; - std::shared_ptr loop_thread_; - - bool ready_{false}; - - static std::once_flag init_flag_; - static std::shared_ptr collective_server_; -}; - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/collective_server_test.cc b/paddle/fluid/operators/distributed/collective_server_test.cc deleted file mode 100644 index 92b2eb4b51e59fec0991712ec4f6d6829b76cfb4..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/collective_server_test.cc +++ /dev/null @@ -1,131 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/operators/distributed/collective_client.h" -#include "paddle/fluid/operators/distributed/collective_server.h" - -namespace paddle { -namespace framework { -class Variable; -} // namespace framework -} // namespace paddle - -namespace framework = paddle::framework; -namespace platform = paddle::platform; -namespace distributed = paddle::operators::distributed; - -std::unique_ptr StartServer( - const std::string& ep, int fan_in, framework::Scope* scope, - platform::DeviceContext* dev_ctx) { - distributed::CollectiveServer* server = - distributed::CollectiveServer::GetInstance(ep, fan_in); - - auto rpc_server = server->GetRPCServer(); - rpc_server->RegisterVar("var1", distributed::kRequestGetMonomerVariable, - scope, dev_ctx); - - std::cout << "StartServer return" << std::endl; - return std::unique_ptr(server); -} - -std::unique_ptr GenerateVars(platform::Place place) { - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto& ctx = *pool.Get(place); - - framework::Scope* scope = new framework::Scope(); - framework::Variable* var = scope->Var("var1"); - auto* slr = var->GetMutable(); - slr->set_height(20000); - - auto* tensor = slr->mutable_value(); - auto* rows = slr->mutable_rows(); - - tensor->Resize(framework::make_ddim({3, 1024})); - tensor->mutable_data(place); - - paddle::operators::math::set_constant(ctx, tensor, 32.7); - for (int i = 0; i < 3; ++i) rows->push_back(i); - - std::cout << "src:" << distributed::GetSelectedRowsInfo(*slr); - - return std::unique_ptr(scope); -} - -void Gather(const std::vector& vars, - platform::DeviceContext* dev_ctx) { - distributed::CollectiveClient* client = - distributed::CollectiveClient::GetInstance(); - - framework::Scope* scope = new framework::Scope(); - framework::Variable* var = scope->Var("var1"); - var->GetMutable(); - - std::vector dst; - client->Gather(vars, &dst, *dev_ctx, scope); - std::cout << "dst:" << distributed::GetSelectedRowsInfo(*dst[0]); - dev_ctx->Wait(); - - ASSERT_EQ(dst[0]->value().dims(), framework::make_ddim({3, 1024})); - ASSERT_EQ(dst[0]->height(), 20000); - ASSERT_EQ(dst[0]->rows().size(), static_cast(3)); - for (int i = 0; i < 3; i++) { - ASSERT_EQ(dst[0]->rows()[i], i); - } - - std::vector vec; - TensorToVector(dst[0]->value(), *dev_ctx, &vec); - for (size_t i = 0; i < 3 * 1024; i++) { - ASSERT_FLOAT_EQ(vec[i], 32.7); - } -} - -TEST(CollectiveServer, GPU) { - setenv("http_proxy", "", 1); - setenv("https_proxy", "", 1); - - platform::CUDAPlace place; - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto& ctx = *pool.Get(place); - - std::string ep = "127.0.0.1:7164"; - auto scope = GenerateVars(place); - - auto* v1 = scope->FindVar("var1"); - std::cout << "var1:" << v1 << std::endl; - - auto server = StartServer(ep, 2, scope.get(), &ctx); - auto rpc_server = server->GetRPCServer(); - - distributed::RemoteVar var; - var.ep_ = ep; - var.var_name_ = "var1"; - var.trainer_id_ = 0; - - std::vector vars{var}; - Gather(vars, &ctx); - Gather(vars, &ctx); - - std::cout << "begin WaitVarBarrier" << std::endl; - rpc_server->WaitVarBarrier("var1"); - rpc_server->ClearRegisteredVars(); - server->Stop(); - - scope.release(); - server.release(); -} diff --git a/paddle/fluid/operators/distributed/communicator.cc b/paddle/fluid/operators/distributed/communicator.cc deleted file mode 100644 
index 4ee27a6414698fec7a5483195118551afac55f49..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/communicator.cc +++ /dev/null @@ -1,989 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/distributed/communicator.h" - -#include - -#include -#include // NOLINT -#include -#include // NOLINT -#include - -#include "gflags/gflags.h" -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/framework/threadpool.h" -#include "paddle/fluid/framework/variable_helper.h" -#include "paddle/fluid/operators/distributed/distributed.h" -#include "paddle/fluid/operators/distributed/parameter_recv.h" -#include "paddle/fluid/operators/distributed/parameter_send.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/fluid/string/split.h" - -namespace paddle { -namespace operators { -namespace distributed { - -using Tree = - std::map>>; -using RpcCtxMap = operators::distributed::RpcCtxMap; - -inline double GetCurrentUS() { - struct timeval time; - gettimeofday(&time, NULL); - return 1e+6 * time.tv_sec + time.tv_usec; -} - -Communicator::Communicator() {} - -std::once_flag Communicator::init_flag_; -std::shared_ptr Communicator::communicator_(nullptr); - -void AsyncCommunicator::InitImpl(const RpcCtxMap &send_varname_to_ctx, - const RpcCtxMap &recv_varname_to_ctx, - Scope *recv_scope) { - send_varname_to_ctx_ = std::move(send_varname_to_ctx); - recv_varname_to_ctx_ = std::move(recv_varname_to_ctx); - recv_scope_ = std::move(recv_scope); - - if (send_varname_to_ctx.size() == 0) { - VLOG(0) << "nothing need to be send, will not start send_thread"; - } else { - send_scope_.reset(new Scope()); - for (auto &iter : send_varname_to_ctx_) { - if (iter.first == STEP_COUNTER && !need_global_step_) continue; - send_varname_to_queue_[iter.first] = - std::make_shared>>( - send_queue_size_); - } - send_threadpool_.reset(new ::ThreadPool(thread_pool_size_)); - } - - if (recv_varname_to_ctx.size() == 0) { - VLOG(0) << "nothing need to be received, will not start recv_thread"; - } else { - recv_threadpool_.reset(new ::ThreadPool(thread_pool_size_)); - } - - InitParams(); -} - -void AsyncCommunicator::InitParams() { RecvNoBarrier(); } - -AsyncCommunicator::~AsyncCommunicator() { - running_ = false; - if (main_thread_) main_thread_->join(); -} - -void AsyncCommunicator::SendGlobalStep(int batches) { - if (!need_global_step_) { - return; - } - - if (batches == 0) { - return; - } - - auto &var_name = STEP_COUNTER; - auto *out_var = send_scope_->Var(var_name); - auto *out_t = out_var->GetMutable(); - auto *data = out_t->mutable_data({1}, platform::CPUPlace()); - data[0] = static_cast(batches); - - auto &ctx = send_varname_to_ctx_.at(var_name); - auto send_functor = distributed::ParameterSend(); - send_functor(ctx, *send_scope_, true, 1); -} - -void AsyncCommunicator::SendByCommunicator() { - 
std::vector> task_futures; - task_futures.reserve(send_varname_to_ctx_.size()); - VLOG(3) << "run send graph"; - - auto before_run_send_graph = GetCurrentUS(); - for (auto &iter : send_varname_to_queue_) { - auto &var_name = iter.first; - auto &var_queue = iter.second; - - auto send_task = [this, &var_name, &var_queue] { - VLOG(3) << var_name << " merge and send; "; - std::vector> vars; - - int merged_var_num = 0; - int wait_times = 0; - while (merged_var_num < max_merge_var_num_) { - if (var_queue->Size() == 0) { - VLOG(4) << "wait_times -> " << wait_times; - if (wait_times >= send_wait_times_) { - break; - } - std::this_thread::sleep_for(std::chrono::milliseconds(10)); - wait_times++; - continue; - } else { - wait_times = 0; - - vars.push_back(var_queue->Pop()); - merged_var_num++; - } - } - auto before_merge = GetCurrentUS(); - if (var_name == STEP_COUNTER) { - SendGlobalStep(merged_var_num); - auto after_merge = GetCurrentUS(); - VLOG(3) << "merge and send " << merged_var_num << " " << var_name - << " use time " << after_merge - before_merge; - return; - } - - auto &ctx = send_varname_to_ctx_.at(var_name); - - MergeVars(var_name, vars, send_scope_.get(), ctx.merge_add); - auto after_merge = GetCurrentUS(); - VLOG(3) << "merge " << merged_var_num << " " << var_name << " use time " - << after_merge - before_merge; - - auto send_functor = distributed::ParameterSend(); - send_functor(ctx, *send_scope_, true, 1); - auto after_send = GetCurrentUS(); - VLOG(3) << "send " << var_name << " use time " - << after_send - after_merge; - - if (var_name.rfind("@GRAD") != var_name.size() - 5) return; - - auto recv_param = var_name.substr(0, var_name.size() - 5); - if (recv_varname_to_ctx_.find(recv_param) == recv_varname_to_ctx_.end()) - return; - - auto recv_functor = distributed::ParameterRecv(); - recv_functor(recv_varname_to_ctx_.at(recv_param), *recv_scope_); - auto after_recv = GetCurrentUS(); - VLOG(3) << "recv " << recv_param << " use time " - << after_recv - after_send; - }; - task_futures.emplace_back(send_threadpool_->enqueue(std::move(send_task))); - } - for (auto &task_f : task_futures) { - task_f.wait(); - } - auto after_run_send_graph = GetCurrentUS(); - - VLOG(3) << "run send graph use time " - << (after_run_send_graph - before_run_send_graph); -} - -void HalfAsyncCommunicator::SendByCommunicator() { - std::vector> task_futures; - task_futures.reserve(send_varname_to_ctx_.size()); - VLOG(3) << "run send graph"; - - int batches = BatchesCounter(); - if (batches <= 0) return; - - auto before_run_send_graph = GetCurrentUS(); - for (auto &iter : send_varname_to_queue_) { - auto &var_name = iter.first; - auto &var_queue = iter.second; - - auto send_task = [this, batches, &var_name, &var_queue] { - VLOG(3) << var_name << " merge and send; "; - auto before_task = GetCurrentUS(); - std::vector> vars; - vars.reserve(batches); - - for (int i = 0; i < batches; ++i) { - vars.push_back(var_queue->Pop()); - } - - if (var_name == STEP_COUNTER) { - SendGlobalStep(batches); - auto end_task = GetCurrentUS(); - VLOG(3) << "merge " << batches << " " << var_name << " use time " - << end_task - before_task; - return; - } - - auto &ctx = send_varname_to_ctx_.at(var_name); - - auto before_merge = GetCurrentUS(); - MergeVars(var_name, vars, send_scope_.get(), ctx.merge_add); - auto after_merge = GetCurrentUS(); - VLOG(3) << "merge " << batches << " " << var_name << " use time " - << after_merge - before_merge; - - auto send_functor = distributed::ParameterSend(); - send_functor(ctx, *send_scope_, true, 1); - 
auto after_send = GetCurrentUS(); - VLOG(3) << "send " << var_name << " use time " - << after_send - before_task; - - if (var_name.rfind("@GRAD") != var_name.size() - 5) return; - - auto recv_param = var_name.substr(0, var_name.size() - 5); - if (recv_varname_to_ctx_.find(recv_param) == recv_varname_to_ctx_.end()) - return; - - auto recv_functor = distributed::ParameterRecv(); - recv_functor(recv_varname_to_ctx_.at(recv_param), *recv_scope_); - auto after_recv = GetCurrentUS(); - VLOG(3) << "recv " << recv_param << " use time " - << after_recv - after_send; - return; - }; - task_futures.emplace_back(send_threadpool_->enqueue(std::move(send_task))); - } - for (auto &task_f : task_futures) { - task_f.wait(); - } - auto after_run_send_graph = GetCurrentUS(); - - VLOG(3) << "run send graph use time " - << (after_run_send_graph - before_run_send_graph); -} - -void AsyncCommunicator::MainThread() { - VLOG(3) << "MainThread start and wait"; - - while (waiting_ && running_) { - std::this_thread::sleep_for(std::chrono::milliseconds(100)); - VLOG(3) << "wait for running"; - } - - while (running_) { - SendByCommunicator(); - BarrierSend(); - } - VLOG(3) << "communicator stopped, send thread exit"; -} - -void HalfAsyncCommunicator::MainThread() { - VLOG(3) << "MainThread start and wait"; - - while (waiting_ && running_) { - std::this_thread::sleep_for(std::chrono::milliseconds(100)); - VLOG(3) << "wait for running"; - } - - while (running_) { - SendByCommunicator(); - BarrierSend(); - RecvByCommunicator(); - BarrierRecv(); - BarrierWeakUp(); - } - VLOG(3) << "communicator stopped, send thread exit"; -} - -void AsyncCommunicator::RecvByCommunicator() { - VLOG(3) << "parallel run recv graph"; - if (!running_) return; - RecvNoBarrier(); - VLOG(3) << "run recv graph use time"; -} - -void AsyncCommunicator::RecvNoBarrier() { - std::vector> task_futures; - task_futures.reserve(recv_varname_to_ctx_.size()); - - for (auto &iter : recv_varname_to_ctx_) { - auto recv_task = [this, &iter] { - auto before_task = GetCurrentUS(); - auto &var_name = iter.first; - auto recv_functor = distributed::ParameterRecv(); - recv_functor(iter.second, *recv_scope_); - auto end_task = GetCurrentUS(); - VLOG(1) << "recv var " << var_name << " use time " - << (end_task - before_task); - }; - task_futures.emplace_back(recv_threadpool_->enqueue(std::move(recv_task))); - } - - for (auto &task : task_futures) { - task.wait(); - } -} - -void AsyncCommunicator::Start() { - VLOG(3) << "Communicator start"; - if (!communicator_) { - VLOG(0) << "Communicator is not inited, do nothing"; - } else { - VLOG(3) << "start send thread and recv thread"; - waiting_ = true; - running_ = true; - BarrierTriggerReset(max_merge_var_num_); - // start send and recv thread - main_thread_.reset( - new std::thread(std::bind(&AsyncCommunicator::MainThread, this))); - } -} - -void AsyncCommunicator::Stop() { - VLOG(3) << "Communicator stop"; - running_ = false; - if (!communicator_) { - VLOG(0) << "Communicator is not inited, do nothing"; - } else { - if (main_thread_) { - VLOG(3) << "stop send thread"; - main_thread_->join(); - main_thread_.reset(nullptr); - } - } - VLOG(3) << "Communicator stop done"; -} - -void AsyncCommunicator::Send(const std::vector &var_names, - const std::vector &var_tables, - const framework::Scope &scope) { - waiting_ = false; - - PADDLE_ENFORCE_EQ( - var_tables.size(), 1, - platform::errors::InvalidArgument("var_tables.size() == 1 is permitted")); - - auto table_name = var_tables[0]; - - if (table_name == STEP_COUNTER && 
!need_global_step_) return; - - auto before_send_op = GetCurrentUS(); - auto &queue = send_varname_to_queue_.at(table_name); - - if (table_name == STEP_COUNTER) { - auto tmp_var = std::make_shared(); - auto *tensor = tmp_var->GetMutable(); - tensor->Resize(framework::make_ddim({1})); - auto *out_d = tensor->mutable_data(platform::CPUPlace()); - out_d[0] = 1; - queue->Push(tmp_var); - } else { - PADDLE_ENFORCE_GE(var_names.size(), 1, - platform::errors::InvalidArgument( - "var_names.size() >= 1 is permitted")); - - auto *var = scope.FindVar(var_names[0]); - - PADDLE_ENFORCE_EQ( - var->IsInitialized(), true, - platform::errors::InvalidArgument("grad var should be inited")); - - auto tmp_var = std::make_shared(); - if (var->IsType()) { - framework::CopyVariable(*var, tmp_var.get()); - queue->Push(tmp_var); - } else if (var->IsType()) { - // push var into send queue by var_name - auto var_name = var_names[0]; - framework::CopyVariable(*var, tmp_var.get()); - queue->Push(tmp_var); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "unknown var type to copy, only support LoDTensor/SelectedRows")); - } - } - auto after_send_op = GetCurrentUS(); - VLOG(3) << "send to " << table_name << " with queue size " << queue->Size() - << ", use time " << (after_send_op - before_send_op); -} - -void HalfAsyncCommunicator::Clean() { - for (auto &iter : send_varname_to_queue_) { - auto &var_name = iter.first; - auto &var_queue = iter.second; - - while (var_queue->Size() > 0) { - var_queue->Pop(); - } - - VLOG(3) << "clean var: " << var_name << " done"; - } -} - -int HalfAsyncCommunicator::BatchesCounter() { - while (running_) { - if (barrier_counter_.load() >= barrier_trigger_.load() && - barrier_trigger_.load() != 0) { - break; - } else { - std::this_thread::sleep_for(std::chrono::milliseconds(10)); - } - } - - return barrier_counter_.load(); -} - -void HalfAsyncCommunicator::Barrier() { - barrier_counter_++; - - if (!running_) { - VLOG(3) << "Communicator is not running, release barrier"; - return; - } - - { - std::unique_lock lk(barrier_mutex_); - barrier_cond_.wait(lk, [this] { return (barrier_counter_ == 0); }); - } -} - -void HalfAsyncCommunicator::BarrierTriggerDecrement() { - barrier_trigger_--; - VLOG(3) << "BarrierTriggerDecrement decrement barrier trigger to " - << barrier_trigger_.load(); -} - -void HalfAsyncCommunicator::BarrierTriggerReset(int initial_val) { - barrier_trigger_.store(initial_val); - - VLOG(3) << "BarrierTriggerReset reset barrier trigger to " - << barrier_trigger_.load(); -} - -void HalfAsyncCommunicator::BarrierWeakUp() { - barrier_counter_.store(0); - barrier_cond_.notify_all(); -} - -void SyncCommunicator::BarrierSend() { - if (!running_) return; - - distributed::RPCClient *rpc_client = - distributed::RPCClient::GetInstance(trainer_id_); - - std::vector rets; - - for (auto &ep : pserver_endpoints_) { - rets.push_back(rpc_client->AsyncSendBatchBarrier(ep)); - } - - for (size_t i = 0; i < rets.size(); i++) { - PADDLE_ENFORCE_NE(rets[i]->Wait(), 0U, platform::errors::External( - "internal error in RPCClient")); - } - - VLOG(4) << "BarrierSend with SyncCommunicator"; -} - -void SyncCommunicator::BarrierRecv() { - if (!running_) return; - - distributed::RPCClient *rpc_client = - distributed::RPCClient::GetInstance(trainer_id_); - - std::vector rets; - for (auto &ep : pserver_endpoints_) { - rets.push_back(rpc_client->AsyncSendFetchBarrier(ep)); - } - - for (size_t i = 0; i < rets.size(); i++) { - PADDLE_ENFORCE_NE(rets[i]->Wait(), 0U, platform::errors::External( - 
"internal error in RPCClient")); - } - - VLOG(4) << "BarrierRecv with SyncCommunicator"; -} - -void GeoCommunicator::InitImpl(const RpcCtxMap &send_varname_to_ctx, - const RpcCtxMap &recv_varname_to_ctx, - Scope *recv_scope) { - send_varname_to_ctx_ = std::move(send_varname_to_ctx); - recv_varname_to_ctx_ = std::move(recv_varname_to_ctx); - recv_scope_ = std::move(recv_scope); - - PADDLE_ENFORCE_GT( - send_varname_to_ctx.size(), 0, - platform::errors::InvalidArgument("send var contexts can not be zero")); - - send_scope_.reset(new Scope()); - for (auto &iter : send_varname_to_ctx_) { - auto &varname = iter.first; - - if (varname == STEP_COUNTER) { - send_varname_to_queue_[varname] = - std::make_shared>>( - send_queue_size_); - } else { - auto &send_ctx = iter.second; - - send_var_nums_ += send_ctx.splited_varnames.size(); - if (!send_ctx.is_sparse) { - continue; - } - int pserver_num = static_cast(send_ctx.epmap.size()); - for (int ep_idx = 0; ep_idx < pserver_num; ep_idx++) { - sparse_id_queues_.insert( - std::pair>>>>( - send_ctx.splited_varnames[ep_idx], - std::make_shared< - BlockingQueue>>>( - send_queue_size_))); - } - } - } - send_threadpool_.reset(new ::ThreadPool(thread_pool_size_)); - - if (recv_varname_to_ctx.size() == 0) { - VLOG(0) << "nothing need to be received, will not start recv_thread"; - } else { - recv_threadpool_.reset(new ::ThreadPool(thread_pool_size_)); - } - - delta_scope_.reset(new Scope()); - old_scope_.reset(new Scope()); - pserver_scope_.reset(new Scope()); - - InitParams(); -} - -void GeoCommunicator::Send(const std::vector &var_names, - const std::vector &var_tables, - const framework::Scope &scope) { - waiting_ = false; - PADDLE_ENFORCE_EQ( - var_tables.size(), 1, - platform::errors::InvalidArgument("var_tables.size() == 1 is permitted")); - - auto table_name = var_tables[0]; - if (table_name == STEP_COUNTER) return; - - auto before_send = GetCurrentUS(); - size_t splited_var_nums = - send_varname_to_ctx_[table_name].splited_varnames.size(); - - std::unordered_map> ids_table; - - for (size_t j = 0; j < splited_var_nums; j++) { - ids_table.insert(std::pair>( - send_varname_to_ctx_[table_name].splited_varnames[j], - std::unordered_set())); - } - auto *var = scope.FindVar(var_names[0]); - auto &rows = var->Get().rows(); - - // insert ids which has not been record - for (size_t j = 0; j < rows.size(); j++) { - auto ep_idx = rows[j] % splited_var_nums; - ids_table.at(send_varname_to_ctx_[table_name].splited_varnames[ep_idx]) - .insert(rows[j]); - } - - auto before_push = GetCurrentUS(); - for (auto &iter : ids_table) { - auto &key = iter.first; - auto &sparse_ids_set = iter.second; - auto sparse_ids_vec = std::make_shared>(); - sparse_ids_vec->assign(sparse_ids_set.begin(), sparse_ids_set.end()); - sparse_id_queues_.at(key)->Push(sparse_ids_vec); - VLOG(3) << "push " << sparse_ids_vec->size() << " ids to " << key - << "'s queue"; - } - auto after_send = GetCurrentUS(); - VLOG(3) << "run send " << table_name << " op finish. 
using " - << (before_push - before_send) << "; " << (after_send - before_push); -} - -void GeoCommunicator::MainThread() { - VLOG(3) << "MainThread start and wait"; - - while (waiting_ && running_) { - std::this_thread::sleep_for(std::chrono::milliseconds(100)); - VLOG(3) << "wait for running"; - } - - while (running_) { - std::vector> tasks; - tasks.reserve(send_var_nums_); - - for (auto &iter : send_varname_to_ctx_) { - auto &var_name = iter.first; - auto &send_ctx = iter.second; - int pserver_num = static_cast(send_ctx.epmap.size()); - if (send_ctx.is_sparse) { - for (int ep_idx = 0; ep_idx < pserver_num; ep_idx++) { - auto send_recv_task = [this, ep_idx, &var_name] { - auto before_send_sparse = GetCurrentUS(); - if (var_name == STEP_COUNTER) { - return; - } - auto send_varname = - send_varname_to_ctx_.at(var_name).splited_varnames[ep_idx]; - auto sparse_ids = MergeSparseIds(send_varname); - if (sparse_ids.size() == 0) { - return; - } - SendSparse(var_name, ep_idx, sparse_ids); - auto after_send_sparse = GetCurrentUS(); - RecvSparse(var_name, ep_idx); - auto after_recv_sparse = GetCurrentUS(); - VLOG(3) - << "send recv " - << send_varname_to_ctx_.at(var_name).splited_varnames[ep_idx] - << " finish, using " << (after_send_sparse - before_send_sparse) - << " and " << (after_recv_sparse - after_send_sparse) - << "; total = " << (after_recv_sparse - before_send_sparse); - }; - tasks.emplace_back( - send_threadpool_->enqueue(std::move(send_recv_task))); - } - } else { - auto send_recv_task = [this, &var_name, &send_ctx] { - if (var_name == STEP_COUNTER) { - return; - } - SendDense(var_name); - RecvDense(var_name); - }; - tasks.emplace_back( - send_threadpool_->enqueue(std::move(send_recv_task))); - } - } - for (auto &task : tasks) { - task.wait(); - } - } -} - -std::vector GeoCommunicator::MergeSparseIds( - const std::string &send_varname) { - size_t merge_num = 0, wait_times = 0; - std::unordered_set sparse_ids; - while (merge_num < static_cast(max_merge_var_num_)) { - VLOG(3) << "Merge Number of " << send_varname << " = " << merge_num; - if (sparse_id_queues_.at(send_varname)->Size() > 0) { - wait_times = 0; - std::shared_ptr> pop_ids = - sparse_id_queues_.at(send_varname)->Pop(); - for (size_t j = 0; j < pop_ids->size(); j++) { - sparse_ids.insert(pop_ids->at(j)); - } - merge_num += 1; - VLOG(3) << "sparse_id_queues_(" << send_varname << ") pushed"; - } else if (sparse_id_queues_.at(send_varname)->Size() == 0) { - VLOG(3) << "wait_times -> " << wait_times; - if (wait_times >= static_cast(send_wait_times_)) { - break; - } - std::this_thread::sleep_for(std::chrono::milliseconds(10)); - wait_times++; - continue; - } - } - std::vector res; - res.assign(sparse_ids.begin(), sparse_ids.end()); - return res; -} -void GeoCommunicator::SendSparse(const std::string &varname, int ep_idx, - const std::vector &sparse_ids) { - auto &rpc_ctx = send_varname_to_ctx_.at(varname); - auto send_varname = rpc_ctx.splited_varnames[ep_idx]; - auto trainer_id = rpc_ctx.trainer_id; - auto endpoint = rpc_ctx.epmap[ep_idx]; - auto pserver_num = rpc_ctx.epmap.size(); - - auto *var_latest = recv_scope_->FindVar(varname); - - PADDLE_ENFORCE_EQ(var_latest->IsInitialized(), true, - platform::errors::Unavailable( - "%s is not initialized, please check", varname)); - auto &t_latest = var_latest->Get(); - - auto dims1 = t_latest.dims()[1]; - - auto cpu_ctx = paddle::platform::CPUDeviceContext(); - auto *var_delta = delta_scope_->Var(send_varname); - auto *t_delta = var_delta->GetMutable(); - - auto *t_value = 
t_delta->mutable_value(); - t_value->mutable_data( - framework::make_ddim({static_cast(sparse_ids.size()), dims1}), - cpu_ctx.GetPlace()); - - std::vector *>> values; - auto *ins = distributed::LargeScaleKV::GetInstance(); - ins->Get(varname)->Get(sparse_ids, {"Param"}, &values); - - auto blas = math::GetBlas(cpu_ctx); - float coefficient = 1.0 / static_cast(trainers_); - - for (auto j = 0; j < static_cast(sparse_ids.size()); ++j) { - blas.VSUB(dims1, t_latest.data() + sparse_ids[j] * dims1, - values[j][0]->data(), t_value->data() + j * dims1); - blas.SCAL(dims1, coefficient, t_value->data() + j * dims1); - blas.VADD(dims1, values[j][0]->data(), t_value->data() + j * dims1, - values[j][0]->data()); - } - - std::vector send_rows; - send_rows.reserve(sparse_ids.size()); - for (auto idx : sparse_ids) { - send_rows.push_back(idx / pserver_num); - } - t_delta->set_height(rpc_ctx.height_sections[ep_idx]); - t_delta->set_rows(send_rows); - - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &cpu_ctx_send = *pool.Get(platform::CPUPlace()); - distributed::RPCClient *rpc_client = - distributed::RPCClient::GetInstance(trainer_id); - - auto ret = rpc_client->AsyncSendVar(endpoint, cpu_ctx_send, - *delta_scope_.get(), send_varname); - ret->Wait(); -} - -void GeoCommunicator::SendDense(const std::string &varname) { - auto *var_latest = recv_scope_->FindVar(varname); - auto *var_timestamp = old_scope_->FindVar(varname); - - PADDLE_ENFORCE_EQ(var_latest->IsInitialized(), true, - platform::errors::Unavailable( - "%s is not initialized, please check", varname)); - PADDLE_ENFORCE_EQ(var_timestamp->IsInitialized(), true, - platform::errors::Unavailable( - "%s is not initialized, please check", varname)); - - auto &t_latest = var_latest->Get(); - auto t_timestamp = var_timestamp->GetMutable(); - - auto cpu_ctx = paddle::platform::CPUDeviceContext(); - auto *var_delta = delta_scope_->Var(varname); - auto *t_delta = var_delta->GetMutable(); - t_delta->mutable_data(t_latest.dims(), cpu_ctx.GetPlace()); - - auto blas = math::GetBlas(cpu_ctx); - blas.VSUB(t_latest.numel(), t_latest.data(), - t_timestamp->data(), t_delta->data()); - - float coefficient = 1.0 / static_cast(trainers_); - blas.SCAL(t_latest.numel(), coefficient, t_delta->data()); - - blas.VADD(t_latest.numel(), t_timestamp->data(), - t_delta->data(), t_timestamp->data()); - - auto &ctx = send_varname_to_ctx_.at(varname); - auto send = distributed::ParameterSend(); - send(ctx, *delta_scope_, true, 1); -} - -void GeoCommunicator::RecvByCommunicator() { return; } - -void GeoCommunicator::RecvSparse(const std::string &varname, int ep_idx) { - auto train_id = recv_varname_to_ctx_.at(varname).trainer_id; - auto endpoint = recv_varname_to_ctx_.at(varname).epmap[ep_idx]; - auto splited_var_name = - recv_varname_to_ctx_.at(varname).splited_varnames[ep_idx]; - auto pserver_num = recv_varname_to_ctx_.at(varname).epmap.size(); - - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &cpu_ctx_recv = *pool.Get(platform::CPUPlace()); - distributed::RPCClient *rpc_client = - distributed::RPCClient::GetInstance(train_id); - - auto *var_psrever = pserver_scope_->Var(splited_var_name); - auto handle = rpc_client->AsyncGetVar(endpoint, cpu_ctx_recv, - *pserver_scope_.get(), splited_var_name, - splited_var_name, splited_var_name); - handle->Wait(); - - auto *var_latest = recv_scope_->FindVar(varname); - - PADDLE_ENFORCE_EQ( - var_psrever->IsInitialized(), true, - platform::errors::Unavailable( - "%s in 
pserver scope is not initialized, please check", varname)); - - std::vector ids; - ids.assign(var_psrever->Get().rows().begin(), - var_psrever->Get().rows().end()); - - for (size_t j = 0; j < ids.size(); j++) { - ids[j] = ids[j] * pserver_num + ep_idx; - } - - VLOG(3) << "RecvSparse receive var: " << splited_var_name - << " ids Size: " << ids.size(); - - auto t_psrever = var_psrever->Get().value(); - - std::vector *>> old_values; - - auto *ins = distributed::LargeScaleKV::GetInstance(); - ins->Get(varname)->Get(ids, {"Param"}, &old_values); - - auto *t_latest = var_latest->GetMutable(); - - auto dims1 = t_latest->dims()[1]; - auto numel = ids.size() * dims1; - - std::vector v_delta; - v_delta.resize(numel); - - auto cpu_ctx = paddle::platform::CPUDeviceContext(); - auto blas = math::GetBlas(cpu_ctx); - - for (auto j = 0; j < static_cast(ids.size()); ++j) { - blas.VSUB(dims1, t_psrever.data() + j * dims1, - old_values[j][0]->data(), v_delta.data() + j * dims1); - blas.VADD(dims1, t_latest->data() + ids[j] * dims1, - v_delta.data() + j * dims1, - t_latest->data() + ids[j] * dims1); - blas.VCOPY(dims1, t_psrever.data() + j * dims1, - old_values[j][0]->data()); - } -} - -void GeoCommunicator::RecvDense(const std::string &varname) { - auto *var_latest = recv_scope_->FindVar(varname); - auto *var_timestamp = old_scope_->FindVar(varname); - auto *var_psrever = pserver_scope_->Var(varname); - - auto &ctx = recv_varname_to_ctx_.at(varname); - auto recv = distributed::ParameterRecv(); - recv(ctx, *pserver_scope_); - - PADDLE_ENFORCE_EQ( - var_psrever->IsInitialized(), true, - platform::errors::Unavailable( - "%s in pserver scope is not initialized, please check", varname)); - - auto t_psrever = var_psrever->Get(); - auto t_latest = var_latest->GetMutable(); - auto t_timestamp = var_timestamp->GetMutable(); - - auto cpu_ctx = paddle::platform::CPUDeviceContext(); - auto *var_delta = delta_scope_->Var(varname); - auto *t_delta = var_delta->GetMutable(); - t_delta->mutable_data(t_latest->dims(), cpu_ctx.GetPlace()); - - auto blas = math::GetBlas(cpu_ctx); - blas.VSUB(t_latest->numel(), t_psrever.data(), - t_timestamp->data(), t_delta->data()); - blas.VADD(t_latest->numel(), t_latest->data(), t_delta->data(), - t_latest->data()); - blas.VCOPY(t_latest->numel(), t_psrever.data(), - t_timestamp->data()); -} - -void GeoCommunicator::InitParams() { - std::vector> tasks; - tasks.reserve(recv_varname_to_ctx_.size()); - - for (auto &iter : recv_varname_to_ctx_) { - auto &var_name = iter.first; - auto &recv_ctx = iter.second; - - auto recv_task = [this, &var_name, &recv_ctx] { - if (!recv_ctx.is_sparse) { - InitDense(var_name); - } - }; - tasks.emplace_back(send_threadpool_->enqueue(std::move(recv_task))); - } - - for (auto &task : tasks) { - task.wait(); - } - InitSparse(); -} - -void GeoCommunicator::InitDense(const std::string varname) { - auto &ctx = recv_varname_to_ctx_.at(varname); - auto recv = distributed::ParameterRecv(); - recv(ctx, *recv_scope_); - - auto *global_var = recv_scope_->FindVar(varname); - global_var->GetMutable(); - - auto *old_var = old_scope_->Var(varname); - old_var->GetMutable(); - - framework::CopyVariable(*global_var, old_var); - VLOG(1) << "init dense variable " << varname << " done"; -} - -void GeoCommunicator::InitSparse() { - auto sparse_metas = string::split_string(sparse_attrs_, "#"); - - std::vector metas; - std::vector dicts; - - for (auto &sparse_meta : sparse_metas) { - auto attrs = string::split_string(sparse_meta, ":"); - - auto meta = distributed::SparseMeta(); - 
meta.name = attrs[0]; - meta.value_names = {"Param"}; - - auto dic = string::split_string(attrs[1], ","); - dicts.push_back(std::stoi(dic[0])); - meta.value_dims = {std::stoi(dic[1])}; - meta.mode = distributed::Mode::training; - meta.grad_name = "none"; - meta.cached_varnames = {}; - meta.initializer_attrs = string::split_string(attrs[2]); - meta.entry = "none"; - - VLOG(3) << "add sparse meta: " << meta.ToString(); - metas.push_back(meta); - } - - LargeScaleKV::Init(metas); - - for (auto &meta : metas) { - auto &ctx = recv_varname_to_ctx_.at(meta.name); - auto recv = distributed::ParameterRecv(); - - auto *global_var = recv_scope_->FindVar(meta.name); - auto global_value = global_var->Get(); - auto rows = global_value.dims()[0]; - auto dim1 = global_value.dims()[1]; - - recv(ctx, *recv_scope_); - VLOG(1) << "recv " << meta.name << " with global scope for init"; - - auto n_rows = global_var->Get().dims()[0]; - - PADDLE_ENFORCE_EQ( - rows, n_rows, - platform::errors::InvalidArgument( - "global var: %s origin dim must equal recved rows", meta.name)); - - std::vector ids(rows); - std::iota(ids.begin(), ids.end(), 0); - - auto *ins = distributed::LargeScaleKV::GetInstance(); - std::vector *>> values; - - ins->Get(meta.name)->Init(ids); - ins->Get(meta.name)->Get(ids, {"Param"}, &values); - - auto blas = math::GetBlas( - paddle::platform::CPUDeviceContext()); - - for (auto &id : ids) { - blas.VCOPY(dim1, global_value.data() + id * dim1, - values[id][0]->data()); - } - } - - VLOG(3) << "init sparse variable done"; -} - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/communicator.h b/paddle/fluid/operators/distributed/communicator.h deleted file mode 100644 index 4be3253d3923f8536f9bb3d455f9bd6c12d67184..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/communicator.h +++ /dev/null @@ -1,490 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "gflags/gflags.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/variable.h" -#include "paddle/fluid/framework/variable_helper.h" -#include "paddle/fluid/operators/distributed/communicator_common.h" -#include "paddle/fluid/operators/distributed/distributed.h" -#include "paddle/fluid/operators/distributed/large_scale_kv.h" -#include "paddle/fluid/operators/distributed/rpc_client.h" -#include "paddle/fluid/operators/distributed_ops/send_recv_util.h" -#include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/operators/math/selected_rows_functor.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/place.h" -#include "paddle/fluid/string/split.h" - -DECLARE_bool(communicator_is_sgd_optimizer); - -namespace paddle { -namespace operators { -namespace distributed { - -using Scope = framework::Scope; -using Variable = framework::Variable; - -template -class BlockingQueue { - public: - explicit BlockingQueue(size_t capacity) : capacity_(capacity) { - PADDLE_ENFORCE_GT(capacity_, 0, - platform::errors::InvalidArgument( - "The capacity must be greater than 0.")); - } - - bool Push(const T &elem) { - { - std::unique_lock lock(mutex_); - cv_.wait(lock, [&] { return queue_.size() < capacity_; }); - PADDLE_ENFORCE_LT( - queue_.size(), capacity_, - platform::errors::OutOfRange("The queue size: %s out of capacity:%s", - queue_.size(), capacity_)); - queue_.push_back(elem); - } - cv_.notify_one(); - return true; - } - - bool Push(T &&elem) { - { - std::unique_lock lock(mutex_); - cv_.wait(lock, [&] { return queue_.size() < capacity_; }); - PADDLE_ENFORCE_LT( - queue_.size(), capacity_, - platform::errors::OutOfRange("The queue size: %s out of capacity:%s", - queue_.size(), capacity_)); - queue_.emplace_back(std::move(elem)); - } - cv_.notify_one(); - return true; - } - - T Pop() { - std::unique_lock lock(mutex_); - cv_.wait(lock, [=] { return !queue_.empty(); }); - T rc(std::move(queue_.front())); - queue_.pop_front(); - cv_.notify_one(); - return rc; - } - - size_t Cap() const { - std::lock_guard lock(mutex_); - return capacity_; - } - - size_t Size() const { - std::lock_guard lock(mutex_); - return queue_.size(); - } - - private: - const size_t capacity_; - std::deque queue_; - - mutable std::mutex mutex_; - std::condition_variable cv_; -}; - -template -using EigenVector = framework::EigenVector; - -template -inline void MergeVars(const std::string &var_name, - const std::vector> &vars, - Scope *scope, bool merge_add = true) { - PADDLE_ENFORCE_NE(vars.empty(), true, platform::errors::InvalidArgument( - "vector vars are empty.")); - auto cpu_place = platform::CPUPlace(); - auto &var0 = vars[0]; - auto *out_var = scope->Var(var_name); - if (var0->IsType()) { - auto dims = var0->Get().dims(); - VLOG(3) << "merge " << var_name << " LoDTensor dims " << dims - << "; merge add: " << merge_add; - // init output tensor - auto *out_t = out_var->GetMutable(); - out_t->mutable_data(dims, cpu_place); - // check the input dims - for (auto &var : vars) { - auto &var_t = var->Get(); - PADDLE_ENFORCE_EQ( - var_t.dims(), dims, - platform::errors::InvalidArgument("vars should have the same dims")); - } - - // set output tensor to 0. 
- auto cpu_ctx = paddle::platform::CPUDeviceContext(); - math::SetConstant constant_functor; - constant_functor(cpu_ctx, out_t, static_cast(0)); - // sum all vars to out - auto result = EigenVector::Flatten(*out_t); - for (auto &var : vars) { - auto &in_t = var->Get(); - auto in = EigenVector::Flatten(in_t); - result.device(*cpu_ctx.eigen_device()) = result + in; - } - if (!merge_add) { - result.device(*cpu_ctx.eigen_device()) = - result / static_cast(vars.size()); - } - } else if (var0->IsType()) { - auto &slr0 = var0->Get(); - auto *out_slr = out_var->GetMutable(); - out_slr->mutable_rows()->clear(); - out_slr->mutable_value()->mutable_data({{}}, cpu_place); - std::vector inputs; - inputs.reserve(vars.size()); - for (auto &var : vars) { - inputs.push_back(&var->Get()); - } - auto dev_ctx = paddle::platform::CPUDeviceContext(); - if (merge_add) { - math::scatter::MergeAdd merge_add; - merge_add(dev_ctx, inputs, out_slr); - } else { - math::scatter::MergeAverage - merge_average; - merge_average(dev_ctx, inputs, out_slr); - } - - VLOG(3) << "merge " << var_name << " SelectedRows height: " << slr0.height() - << " dims: " << slr0.value().dims() << "; merge add: " << merge_add; - } else { - PADDLE_THROW(platform::errors::InvalidArgument("unsupported var type: %s!", - var0->Type())); - } -} - -using RpcCtxMap = std::unordered_map; -using SparseValue = std::unordered_map>; - -class Communicator { - public: - Communicator(); - - explicit Communicator(const std::map &envs_) { - for (auto &iter : envs_) { - envs[iter.first] = iter.second; - } - } - - virtual ~Communicator() {} - - virtual void Start() = 0; - - virtual void Stop() = 0; - - virtual bool IsRunning() { return running_; } - - virtual void Clean() {} - - virtual void Send(const std::vector &var_names, - const std::vector &var_tables, - const framework::Scope &scope) = 0; - - virtual void RecvNoBarrier() {} - - virtual void Barrier() {} - - virtual void BarrierTriggerDecrement() {} - - virtual void BarrierTriggerReset(int init_counter) {} - - virtual void InitEnvs() = 0; - - virtual void InitImpl(const RpcCtxMap &send_varname_to_ctx, - const RpcCtxMap &recv_varname_to_ctx, - Scope *recv_scope) {} - - static Communicator *GetInstance() { return communicator_.get(); } - - static std::shared_ptr GetInstantcePtr() { - return communicator_; - } - - template - static Communicator *InitInstance( - const RpcCtxMap &send_ctx, const RpcCtxMap &recv_ctx, Scope *recv_scope, - const std::map &envs) { - std::call_once(init_flag_, &Communicator::InitWithRpcCtx, send_ctx, - recv_ctx, recv_scope, std::ref(envs)); - return communicator_.get(); - } - - // Init is called by InitInstance. 
- template - static void InitWithRpcCtx(const RpcCtxMap &send_ctx, - const RpcCtxMap &recv_ctx, Scope *recv_scope, - const std::map &envs) { - if (communicator_.get() == nullptr) { - communicator_.reset(new T(std::ref(envs))); - communicator_->InitEnvs(); - communicator_->InitImpl(send_ctx, recv_ctx, recv_scope); - } - } - - protected: - bool running_ = false; - bool waiting_ = true; - static std::shared_ptr communicator_; - static std::once_flag init_flag_; - std::unordered_map envs; -}; - -class AsyncCommunicator : public Communicator { - public: - AsyncCommunicator() : Communicator() {} - - explicit AsyncCommunicator(const std::map &envs) - : Communicator(envs) {} - - ~AsyncCommunicator(); - - void InitEnvs() { - min_send_grad_num_before_recv_ = - std::stoi(envs.at("communicator_min_send_grad_num_before_recv")); - thread_pool_size_ = std::stoi(envs.at("communicator_thread_pool_size")); - max_merge_var_num_ = std::stoi(envs.at("communicator_max_merge_var_num")); - send_wait_times_ = std::stoi(envs.at("communicator_send_wait_times")); - send_queue_size_ = std::stoi(envs.at("communicator_send_queue_size")); - need_global_step_ = - static_cast(std::stoi(envs.at("need_global_step"))); - VLOG(0) << "AsyncCommunicator Initialized"; - } - - void Start() override; - - void Stop() override; - - void InitImpl(const RpcCtxMap &send_varname_to_ctx, - const RpcCtxMap &recv_varname_to_ctx, - Scope *recv_scope) override; - - void InitParams(); - - virtual void MainThread(); - - void Send(const std::vector &var_names, - const std::vector &var_tables, - const framework::Scope &scope) override; - - virtual void SendByCommunicator(); - virtual void SendGlobalStep(int batches); - - virtual void RecvByCommunicator(); - - virtual void RecvNoBarrier(); - - virtual void BarrierSend() {} - - virtual void BarrierRecv() {} - - virtual void BarrierWeakUp() {} - - protected: - int min_send_grad_num_before_recv_; - int thread_pool_size_; - int max_merge_var_num_; - int send_wait_times_; - int send_queue_size_; - int trainer_id_ = 0; - bool need_global_step_ = false; - - std::unordered_map>>> - send_varname_to_queue_; - RpcCtxMap send_varname_to_ctx_; - RpcCtxMap recv_varname_to_ctx_; - std::unique_ptr main_thread_{nullptr}; - Scope *recv_scope_; // should be global scope - std::unique_ptr send_scope_; // an independent scope - std::unique_ptr<::ThreadPool> send_threadpool_{nullptr}; - std::unique_ptr<::ThreadPool> recv_threadpool_{nullptr}; - std::atomic_uint grad_num_{0}; // the num of gradient sent since last recv -}; - -class HalfAsyncCommunicator : public AsyncCommunicator { - public: - HalfAsyncCommunicator() {} - - explicit HalfAsyncCommunicator(const std::map &envs) - : AsyncCommunicator(envs) {} - - void InitEnvs() { - min_send_grad_num_before_recv_ = 0; - - max_merge_var_num_ = std::stoi(envs.at("communicator_max_merge_var_num")); - send_wait_times_ = std::stoi(envs.at("communicator_send_wait_times")); - thread_pool_size_ = std::stoi(envs.at("communicator_thread_pool_size")); - send_queue_size_ = std::stoi(envs.at("communicator_send_queue_size")); - need_global_step_ = - static_cast(std::stoi(envs.at("need_global_step"))); - VLOG(0) << "HalfAsyncCommunicator Initialized"; - } - - void MainThread() override; - - void SendByCommunicator() override; - - void Clean() override; - - void Barrier() override; - - void BarrierTriggerDecrement() override; - - void BarrierTriggerReset(int initial_val) override; - - int BatchesCounter(); - - void BarrierWeakUp(); - - protected: - // mutex for Wait for barrier - 
std::mutex barrier_mutex_; - std::condition_variable barrier_cond_; - std::atomic barrier_trigger_{0}; - std::atomic barrier_counter_{0}; -}; - -class SyncCommunicator : public HalfAsyncCommunicator { - public: - SyncCommunicator() : HalfAsyncCommunicator() {} - - explicit SyncCommunicator(const std::map &envs) - : HalfAsyncCommunicator(envs) {} - - void InitEnvs() { - min_send_grad_num_before_recv_ = 0; - - max_merge_var_num_ = std::stoi(envs.at("communicator_max_merge_var_num")); - send_wait_times_ = std::stoi(envs.at("communicator_send_wait_times")); - thread_pool_size_ = std::stoi(envs.at("communicator_thread_pool_size")); - send_queue_size_ = std::stoi(envs.at("communicator_send_queue_size")); - need_global_step_ = - static_cast(std::stoi(envs.at("need_global_step"))); - - trainer_id_ = std::stoi(envs.at("trainer_id")); - auto pserver_strings = envs.at("pserver_endpoints"); - pserver_endpoints_ = paddle::string::Split(pserver_strings, ','); - VLOG(0) << "SyncCommunicator Initialized"; - } - - void BarrierSend(); - - void BarrierRecv(); - - private: - std::vector pserver_endpoints_{}; -}; - -class GeoCommunicator : public AsyncCommunicator { - public: - GeoCommunicator() : AsyncCommunicator() {} - - explicit GeoCommunicator(const std::map &envs) - : AsyncCommunicator(envs) {} - - void InitImpl(const RpcCtxMap &send_varname_to_ctx, - const RpcCtxMap &recv_varname_to_ctx, - Scope *recv_scope) override; - void MainThread() override; - void InitEnvs() { - min_send_grad_num_before_recv_ = 0; - - max_merge_var_num_ = std::stoi(envs.at("communicator_max_merge_var_num")); - send_wait_times_ = std::stoi(envs.at("communicator_send_wait_times")); - thread_pool_size_ = std::stoi(envs.at("communicator_thread_pool_size")); - - send_queue_size_ = max_merge_var_num_; - trainers_ = std::stoi(envs.at("trainers")); - sparse_attrs_ = envs.at("sparse_attrs"); - VLOG(0) << "GeoCommunicator Initialized"; - } - - void Send(const std::vector &var_names, - const std::vector &var_tables, - const framework::Scope &scope) override; - - void SendByCommunicator() { return; } - - std::vector MergeSparseIds(const std::string &send_varname); - - void SendSparse(const std::string &varname, int ep_idx, - const std::vector &sparse_ids); - - void SendDense(const std::string &varname); - - void SendGlobalStep(int batches) override {} - - void RecvByCommunicator() override; - - void RecvSparse(const std::string &varname, int ep_idx); - - void RecvDense(const std::string &varname); - - void InitParams(); - - void InitSparse(); - - void InitDense(const std::string varname); - - private: - int trainers_; - std::string sparse_attrs_; - - // parameter for delta calc and send - std::shared_ptr delta_scope_; - - // parameter for storage the pserver param after last recv - std::shared_ptr old_scope_; - - // parameter on pserver - std::shared_ptr pserver_scope_; - - int send_var_nums_ = 0; - - std::unordered_map> old_sparses_; - - std::unordered_map< - std::string, - std::shared_ptr>>>> - sparse_id_queues_; -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/communicator_common.h b/paddle/fluid/operators/distributed/communicator_common.h deleted file mode 100644 index 122d904eba27aa86fe333312340788dc0aef0d47..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/communicator_common.h +++ /dev/null @@ -1,91 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include - -namespace paddle { -namespace operators { -namespace distributed { - -struct CommContext { - CommContext() = default; - - CommContext(const std::string &name, const std::vector &names, - const std::vector &emap, - const std::vector §ions, - const std::vector &origin_names, int id, - bool merge_add_ = true, bool is_sparse_ = true, - bool is_distributed_ = false) - : var_name(name), - splited_varnames(names), - epmap(emap), - height_sections(sections), - origin_varnames(origin_names), - trainer_id(id), - merge_add(merge_add_), - is_sparse(is_sparse_), - is_distributed(is_distributed_) {} - - CommContext(const CommContext &ctx) { - var_name = ctx.var_name; - splited_varnames = ctx.splited_varnames; - epmap = ctx.epmap; - height_sections = ctx.height_sections; - trainer_id = ctx.trainer_id; - merge_add = ctx.merge_add; - is_sparse = ctx.is_sparse; - origin_varnames = ctx.origin_varnames; - is_distributed = ctx.is_distributed; - } - - std::string print() const { - std::stringstream ss; - - ss << "varname: " << var_name << " trainer_id: " << trainer_id << " "; - - for (size_t i = 0; i < splited_varnames.size(); i++) { - ss << "slice varname: " << splited_varnames[i] << " ep: " << epmap[i] - << " section: " << height_sections[i] << " "; - } - - ss << "origin varnames: "; - for (size_t i = 0; i < origin_varnames.size(); i++) { - ss << origin_varnames[i] << " "; - } - - ss << " aggregation->add: " << merge_add << " "; - ss << " is_sparse: " << is_sparse << "\n"; - ss << " is_distributed: " << is_distributed << "\n"; - - return ss.str(); - } - - std::string var_name; - std::vector splited_varnames; - std::vector epmap; - std::vector height_sections; - std::vector origin_varnames; - int trainer_id; - bool merge_add; - bool is_sparse; - bool is_distributed; -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/communicator_test.cc b/paddle/fluid/operators/distributed/communicator_test.cc deleted file mode 100644 index 38b7c8b00317e6880434e975438c72ba9248aee2..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/communicator_test.cc +++ /dev/null @@ -1,106 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include - -#include "paddle/fluid/operators/distributed/communicator.h" - -namespace paddle { -namespace operators { -namespace distributed { - -using LoDTensor = framework::LoDTensor; -using SelectedRows = framework::SelectedRows; - -TEST(communicator, merge_lod_tensors) { - auto cpu_place = platform::CPUPlace(); - auto dims = framework::make_ddim({2, 3}); - std::vector> in_vars; - float out_value = 0; - for (auto i = 0; i < 10; ++i) { - auto var = std::make_shared(); - in_vars.emplace_back(var); - auto *tensor = var->GetMutable(); - auto *data = tensor->mutable_data(dims, cpu_place); - for (auto j = 0; j < tensor->numel(); ++j) { - data[j] = static_cast(i); - } - out_value += static_cast(i); - } - const std::string out_name = "Out"; - std::unique_ptr scope; - scope.reset(new framework::Scope()); - scope->Var(out_name); - for (auto i = 0; i < 10; ++i) { - MergeVars(out_name, in_vars, scope.get()); - } - auto &out_tensor = scope->FindVar(out_name)->Get(); - auto *out_data = out_tensor.data(); - ASSERT_EQ(out_tensor.dims(), dims); - for (auto i = 0; i < out_tensor.numel(); ++i) { - ASSERT_EQ(out_data[i], out_value); - } -} - -TEST(communicator, merge_selected_rows) { - auto cpu_place = platform::CPUPlace(); - int64_t width = 10; - std::vector> in_vars; - const int64_t height = 100; - for (auto i = 0; i < 10; ++i) { - std::vector rows; - for (auto k = 0; k <= i; ++k) { - rows.push_back(k); - } - auto var = std::make_shared(); - in_vars.emplace_back(var); - auto *slr = var->GetMutable(); - slr->set_height(height); - slr->set_rows(rows); - auto dims = - framework::make_ddim({static_cast(rows.size()), width}); - auto *data = slr->mutable_value()->mutable_data(dims, cpu_place); - for (size_t i = 0; i < rows.size(); ++i) { - for (auto j = 0; j < width; ++j) { - data[i * width + j] = static_cast(rows[i]); - } - } - } - const std::string out_name = "Out"; - std::unique_ptr scope; - scope.reset(new framework::Scope()); - scope->Var(out_name); - for (auto i = 0; i < 10; ++i) { - MergeVars(out_name, in_vars, scope.get()); - } - auto &out_slr = scope->FindVar(out_name)->Get(); - auto &out_t = out_slr.value(); - auto *out_data = out_t.data(); - ASSERT_EQ(out_t.dims(), framework::make_ddim({10, width})); - std::vector out_values; - out_values.reserve(10); - for (auto i = 0; i < 10; ++i) { - out_values.push_back(static_cast(i * (10 - i))); - } - for (size_t i = 0; i < out_slr.rows().size(); ++i) { - ASSERT_EQ(out_slr.rows()[i], static_cast(i)); - for (auto j = 0; j < width; ++j) { - ASSERT_EQ(out_data[i * width + j], out_values[i]); - } - } -} - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/distributed.h b/paddle/fluid/operators/distributed/distributed.h deleted file mode 100644 index 5917c18fb0d20104738c3d5868d419135f5108be..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/distributed.h +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#ifdef PADDLE_WITH_DISTRIBUTE - -#ifdef PADDLE_WITH_GRPC -#include "paddle/fluid/operators/distributed/communicator.h" - -#include "paddle/fluid/operators/distributed/grpc/grpc_client.h" -#include "paddle/fluid/operators/distributed/grpc/grpc_server.h" -#define RPCSERVER_T paddle::operators::distributed::AsyncGRPCServer -#define RPCCLIENT_T paddle::operators::distributed::GRPCClient - -#else // PADDLE_WITH_GRPC - -#include "paddle/fluid/operators/distributed/brpc/brpc_client.h" -#include "paddle/fluid/operators/distributed/brpc/brpc_server.h" -#define RPCSERVER_T paddle::operators::distributed::AsyncBRPCServer -#define RPCCLIENT_T paddle::operators::distributed::BRPCClient - -#endif // PADDLE_WITH_GRPC - -#endif // PADDLE_WITH_DISTRIBUTE diff --git a/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.cc b/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.cc deleted file mode 100644 index 7d6756b41363d12af68402817cfee1df408b8827..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.cc +++ /dev/null @@ -1,92 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -// NOTE: This file was originally created by tensorflow -// (https://github.com/tensorflow/tensorflow/) we borrow this -// file and did some modifications so that we can send gRPC -// requests without too much copying of the tensor data. - -#include "paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h" - -namespace grpc { -class ByteBuffer; -} // namespace grpc - -namespace paddle { -namespace operators { -namespace distributed { - -GrpcByteBufferSource::GrpcByteBufferSource() {} - -bool GrpcByteBufferSource::Init(const grpc::ByteBuffer& src) { - cur_ = -1; - left_ = 0; - ptr_ = nullptr; - byte_count_ = 0; - bool ok = src.Dump(&slices_).ok(); - if (!ok) { - slices_.clear(); - } - return ok; -} - -bool GrpcByteBufferSource::Next(const void** data, int* size) { - // Use loop instead of if in case buffer contained empty slices. - while (left_ == 0) { - // Advance to next slice. 
- cur_++; - if (cur_ >= slices_.size()) { - return false; - } - const ::grpc::Slice& s = slices_[cur_]; - left_ = s.size(); - ptr_ = reinterpret_cast(s.begin()); - } - - *data = ptr_; - *size = left_; - byte_count_ += left_; - ptr_ += left_; - left_ = 0; - return true; -} - -void GrpcByteBufferSource::BackUp(int count) { - ptr_ -= count; - left_ += count; - byte_count_ -= count; -} - -bool GrpcByteBufferSource::Skip(int count) { - const void* data; - int size; - while (Next(&data, &size)) { - if (size >= count) { - BackUp(size - count); - return true; - } - // size < count; - count -= size; - } - // error or we have too large count; - return false; -} - -google::protobuf::int64 GrpcByteBufferSource::ByteCount() const { - return byte_count_; -} - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h b/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h deleted file mode 100644 index 486870de7a554e675bb01492e775654bbcb34da3..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h +++ /dev/null @@ -1,174 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -// NOTE: This file was originally created by tensorflow -// (https://github.com/tensorflow/tensorflow/) we borrow this -// file and did some modifications so that we can send gRPC -// requests without too much copying of the tensor data. 
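Both classes in this pair of deleted files, GrpcByteBufferSource in the .cc above and GrpcBufferReader in the header that follows, implement protobuf's ZeroCopyInputStream over a list of gRPC slices: Next() hands out an entire slice without copying, BackUp() returns the unread tail of the last slice, Skip() discards bytes, and ByteCount() reports how many bytes have been yielded so far. The sketch below captures that contract over plain std::string slices; it is illustrative only and does not derive from the protobuf interface.

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <string>
#include <vector>

// Toy zero-copy source: yields each slice's buffer directly, no memcpy.
class SliceSource {
 public:
  explicit SliceSource(std::vector<std::string> slices)
      : slices_(std::move(slices)) {}

  bool Next(const void** data, int* size) {
    while (left_ == 0) {  // advance, skipping empty slices
      if (++cur_ >= slices_.size()) return false;
      ptr_ = slices_[cur_].data();
      left_ = static_cast<int>(slices_[cur_].size());
    }
    *data = ptr_;
    *size = left_;
    byte_count_ += left_;
    ptr_ += left_;
    left_ = 0;
    return true;
  }

  void BackUp(int count) {  // give back the last `count` bytes
    ptr_ -= count;
    left_ += count;
    byte_count_ -= count;
  }

  int64_t ByteCount() const { return byte_count_; }

 private:
  std::vector<std::string> slices_;
  std::size_t cur_ = static_cast<std::size_t>(-1);
  int left_ = 0;
  const char* ptr_ = nullptr;
  int64_t byte_count_ = 0;
};

int main() {
  SliceSource src({"head", "", "payload"});
  const void* data = nullptr;
  int size = 0;
  assert(src.Next(&data, &size) && size == 4);  // "head"
  src.BackUp(1);                                // return the trailing 'd'
  assert(src.Next(&data, &size) && size == 1);  // the backed-up byte
  assert(src.Next(&data, &size) && size == 7);  // "payload", empty slice skipped
  assert(!src.Next(&data, &size));              // exhausted
  assert(src.ByteCount() == 11);
  return 0;
}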
- -#pragma once - -#include - -#include "google/protobuf/io/coded_stream.h" -#include "google/protobuf/io/zero_copy_stream.h" -#include "grpc++/grpc++.h" -#include "paddle/fluid/operators/distributed/variable_response.h" - -struct grpc_byte_buffer; - -namespace grpc { -// A ZeroCopyInputStream that reads from grpc_byte_buffer -class ByteBuffer; - -class GrpcBufferReader final - : public ::google::protobuf::io::ZeroCopyInputStream { - typedef void (CoreCodegenInterface::*OldReaderInitAPI)( - grpc_byte_buffer_reader* reader, grpc_byte_buffer* buffer); - typedef int (CoreCodegenInterface::*NewReaderInitAPI)( - grpc_byte_buffer_reader* reader, grpc_byte_buffer* buffer); - void ReaderInit(OldReaderInitAPI ptr, grpc_byte_buffer_reader* reader, - grpc_byte_buffer* buffer) { - (g_core_codegen_interface->*ptr)(reader, buffer); - } - void ReaderInit(NewReaderInitAPI ptr, grpc_byte_buffer_reader* reader, - grpc_byte_buffer* buffer) { - int result = (g_core_codegen_interface->*ptr)(reader, buffer); - (void)result; - } - - public: - explicit GrpcBufferReader(grpc_byte_buffer* buffer) - : byte_count_(0), backup_count_(0) { - ReaderInit(&CoreCodegenInterface::grpc_byte_buffer_reader_init, &reader_, - buffer); - } - ~GrpcBufferReader() override { - g_core_codegen_interface->grpc_byte_buffer_reader_destroy(&reader_); - } - - bool Next(const void** data, int* size) override { - if (backup_count_ > 0) { - *data = GRPC_SLICE_START_PTR(slice_) + GRPC_SLICE_LENGTH(slice_) - - backup_count_; - GPR_CODEGEN_ASSERT(backup_count_ <= INT_MAX); - *size = static_cast(backup_count_); - backup_count_ = 0; - return true; - } - if (!g_core_codegen_interface->grpc_byte_buffer_reader_next(&reader_, - &slice_)) { - return false; - } - g_core_codegen_interface->grpc_slice_unref(slice_); - *data = GRPC_SLICE_START_PTR(slice_); - // On win x64, int is only 32bit - GPR_CODEGEN_ASSERT(GRPC_SLICE_LENGTH(slice_) <= INT_MAX); - byte_count_ += * size = static_cast(GRPC_SLICE_LENGTH(slice_)); - return true; - } - - void BackUp(int count) override { backup_count_ = count; } - - bool Skip(int count) override { - const void* data; - int size; - while (Next(&data, &size)) { - if (size >= count) { - BackUp(size - count); - return true; - } - // size < count; - count -= size; - } - // error or we have too large count; - return false; - } - - ::google::protobuf::int64 ByteCount() const override { - return byte_count_ - backup_count_; - } - - private: - int64_t byte_count_; - int64_t backup_count_; - grpc_byte_buffer_reader reader_; - grpc_slice slice_; -}; - -}; // namespace grpc - -namespace paddle { -namespace operators { -namespace distributed { - -// A ZeroCopyInputStream that reads from a grpc::ByteBuffer. -class GrpcByteBufferSource - : public ::google::protobuf::io::ZeroCopyInputStream { - public: - GrpcByteBufferSource(); - bool Init(const ::grpc::ByteBuffer& src); // Can be called multiple times. - bool Next(const void** data, int* size) override; - void BackUp(int count) override; - bool Skip(int count) override; - ::google::protobuf::int64 ByteCount() const override; - - private: - std::vector<::grpc::Slice> slices_; - size_t cur_; // Current slice index. - int left_; // Number of bytes in slices_[cur_] left to yield. - const char* ptr_; // Address of next byte in slices_[cur_] to yield. 
- ::google::protobuf::int64 byte_count_; -}; - -class GrpcByteBufferSourceWrapper : public Source { - public: - explicit GrpcByteBufferSourceWrapper(GrpcByteBufferSource* source) - : source_(source) {} - ::google::protobuf::io::ZeroCopyInputStream* contents() override { - return source_; - } - - private: - GrpcByteBufferSource* source_; -}; - -class GrpcByteSource : public Source { - public: - explicit GrpcByteSource(grpc_byte_buffer* buffer) : buffer_(buffer) {} - ~GrpcByteSource() override { DeleteStream(); } - - typedef ::grpc::GrpcBufferReader Reader; - - ::google::protobuf::io::ZeroCopyInputStream* contents() override { - DeleteStream(); - stream_ = new (&space_) Reader(buffer_); - return stream_; - } - - private: - void DeleteStream() { - if (stream_) { - stream_->~Reader(); - } - } - - grpc_byte_buffer* buffer_; // Not owned - Reader* stream_ = nullptr; // Points into space_ if non-nullptr - char space_[sizeof(Reader)]; -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc/grpc_client.cc b/paddle/fluid/operators/distributed/grpc/grpc_client.cc deleted file mode 100644 index 97a9c14e4f1850990ca7ade55d0c9ddd83b5fbab..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/grpc/grpc_client.cc +++ /dev/null @@ -1,671 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include - -#include "glog/logging.h" // For VLOG -#include "paddle/fluid/framework/threadpool.h" -#include "paddle/fluid/operators/distributed/grpc/grpc_client.h" -#include "paddle/fluid/operators/distributed/grpc/grpc_serde.h" -#include "paddle/fluid/operators/distributed/request_handler.h" -#include "paddle/fluid/platform/port.h" -#include "paddle/fluid/platform/profiler.h" - -DEFINE_int32(rpc_client_threads, 2, ""); -DECLARE_bool(rpc_disable_reuse_port); - -namespace paddle { -namespace operators { -namespace distributed { - -void GRPCClient::InitImpl() { - // start the client process thread - // TODO(wuyi): can make this in a threadpool - client_threads_.resize(FLAGS_rpc_client_threads); - for (int i = 0; i < FLAGS_rpc_client_threads; i++) { - client_threads_[i].reset( - new std::thread(std::bind(&GRPCClient::Proceed, this))); - } -} - -void GRPCClient::SendComplete() { - std::unique_lock lk(completed_mutex_); - if (!completed_) { - for (auto& it : channels_) { - VLOG(3) << "send complete message to " << it.first; - this->AsyncSendComplete(it.first); - } - PADDLE_ENFORCE_EQ(this->Wait(), true, platform::errors::PreconditionNotMet( - "internal grpc service error.")); - completed_ = true; - } -} - -GRPCClient::~GRPCClient() { - stopped_ = true; - Wait(); - cq_.Shutdown(); - { - std::lock_guard guard(chan_mutex_); - for (auto& it : channels_) { - it.second.reset(); - } - channels_.clear(); - } - for (size_t i = 0; i < client_threads_.size(); i++) - client_threads_[i]->join(); -} - -VarHandlePtr GRPCClient::AsyncSendVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - int64_t time_out) { - const platform::DeviceContext* p_ctx = &ctx; - const std::string ep_val = ep; - const std::string var_name_val = var_name; - const framework::Scope* p_scope = &scope; - const auto ch = GetChannel(ep_val); - const std::string method = kSendRPC; - - int retry_times_ = 0; - - while (true) { - SendProcessor* s = new SendProcessor(ch); - VarHandlePtr h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope)); - s->Prepare(h, time_out); - - framework::Async([var_name_val, p_scope, p_ctx, s, method, h, this] { - auto* var = p_scope->FindVar(var_name_val); - - ::grpc::ByteBuffer req; - SerializeToByteBuffer(var_name_val, var, *p_ctx, &req, "", trainer_id_); - - VLOG(3) << s->GetVarHandlePtr()->String() << " begin"; - - // stub context - s->response_call_back_ = nullptr; - - platform::RecordRPCEvent record_event(method); - - auto call = s->stub_g_.PrepareUnaryCall( - s->context_.get(), "/sendrecv.SendRecvService/SendVariable", req, - &cq_); - call->StartCall(); - call->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); - - if (UNLIKELY(platform::IsProfileEnabled())) { - h->Wait(); - } - }); - req_count_++; - - if (FLAGS_rpc_retry_times > 0 && retry_times_ < FLAGS_rpc_retry_times) { - h->Wait(); - if (h->should_retry) { - VLOG(3) << "rpc call failed, retry times " << retry_times_; - retry_times_++; - std::random_device rd; - std::this_thread::sleep_for(std::chrono::milliseconds(rd() % 5)); - continue; - } - } - - return h; - } -} - -void ProcGetResponse(const VarHandle& var_h, - const ::grpc::ByteBuffer& ret_msg) { - VLOG(4) << "ProcGetResponse"; - framework::Variable* outvar = nullptr; - // get response's trainer_id is not used - int trainer_id; - DeserializeFromByteBuffer(ret_msg, *var_h.ctx(), var_h.scope(), &outvar, - &trainer_id); -} - -void ProcGetRecvResponse(const VarHandle& var_h, - const ::grpc::ByteBuffer& 
ret_msg) { - VLOG(4) << "ProcGetRecvResponse"; - framework::Variable* outvar = nullptr; - int trainer_id; - DeserializeRecvFromByteBuffer(ret_msg, *var_h.ctx(), var_h.scope(), &outvar, - &trainer_id); -} - -template -void RequestToByteBuffer(const T& proto, ::grpc::ByteBuffer* result) { - ::grpc::Slice slice(proto.ByteSizeLong()); - proto.SerializeWithCachedSizesToArray(const_cast(slice.begin())); - ::grpc::ByteBuffer tmp(&slice, 1); - result->Swap(&tmp); -} - -VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - const std::string& out_varname, - const std::string& table_name, - int64_t time_out) { - return _AsyncGetVar(ep, ctx, scope, kGetRPC, var_name, out_varname, - "/sendrecv.SendRecvService/GetVariable", table_name, - time_out); -} - -VarHandlePtr GRPCClient::AsyncGetVarNoBarrier( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - const std::string& out_varname, int64_t time_out) { - std::string var_name_no_barrier = - string::Sprintf("%s%s", var_name, WITHOUT_BARRIER_MESSAGE); - - return _AsyncGetVar( - ep, ctx, scope, kGetNoBarrierRPC, var_name_no_barrier, out_varname, - "/sendrecv.SendRecvService/GetVariableNoBarrier", "", time_out); -} - -VarHandlePtr GRPCClient::AsyncGetMonomerVariable( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - int64_t time_out) { - return _AsyncGetVar(ep, ctx, scope, kGetMonomerRPC, var_name, var_name, - "/sendrecv.SendRecvService/GetMonomerVariable", "", - time_out); -} - -VarHandlePtr GRPCClient::_AsyncGetVar( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& method, - const std::string& var_name, const std::string& out_varname, - const std::string& rpc_path, const std::string& table_name, - int64_t time_out) { - const platform::DeviceContext* p_ctx = &ctx; - const std::string ep_val = ep; - const std::string var_name_val = var_name; - const std::string out_varname_val = out_varname; - const std::string table_name_val = table_name; - const framework::Scope* p_scope = &scope; - const auto ch = GetChannel(ep_val); - - int retry_times_ = 0; - - while (true) { - GetProcessor* s = new GetProcessor(ch); - - VarHandlePtr h(new VarHandle(ep, method, out_varname_val, p_ctx, p_scope)); - s->Prepare(h, time_out); - - framework::Async([var_name_val, out_varname_val, table_name_val, s, method, - p_ctx, h, rpc_path, this] { - // prepare input - sendrecv::VariableMessage req; - req.set_varname(var_name_val); - req.set_out_varname(out_varname_val); - req.set_trainer_id(trainer_id_); - req.set_table_name(table_name_val); - ::grpc::ByteBuffer buf; - RequestToByteBuffer(req, &buf); - - VLOG(3) << s->GetVarHandlePtr()->String() << " begin"; - - // stub context - s->response_call_back_ = ProcGetResponse; - - platform::RecordRPCEvent record_event(method); - - auto call = - s->stub_g_.PrepareUnaryCall(s->context_.get(), rpc_path, buf, &cq_); - call->StartCall(); - call->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); - - if (UNLIKELY(platform::IsProfileEnabled())) { - h->Wait(); - } - }); - req_count_++; - - if (FLAGS_rpc_retry_times > 0 && retry_times_ < FLAGS_rpc_retry_times) { - h->Wait(); - if (h->should_retry) { - VLOG(3) << "rpc call failed, retry times " << retry_times_; - retry_times_++; - std::random_device rd; - 
std::this_thread::sleep_for(std::chrono::milliseconds(rd() % 5)); - continue; - } - } - - return h; - } -} - -VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& in_var_name, - const std::string& out_var_name, - const std::string& table_name, - int64_t time_out) { - const platform::DeviceContext* p_ctx = &ctx; - const std::string ep_val = ep; - const std::string in_var_name_val = in_var_name; - const std::string out_var_name_val = out_var_name; - const std::string table_name_val = table_name; - const framework::Scope* p_scope = &scope; - const auto ch = GetChannel(ep_val); - - const std::string method = kPrefetchRPC; - int retry_times_ = 0; - - while (true) { - GetProcessor* s = new GetProcessor(ch); - VarHandlePtr h(new VarHandle(ep, method, out_var_name_val, p_ctx, p_scope)); - s->Prepare(h, kPrefetchTimeout); - - auto* var = p_scope->FindVar(in_var_name_val); - - ::grpc::ByteBuffer req; - SerializeToByteBuffer(in_var_name_val, var, *p_ctx, &req, out_var_name_val, - 0, table_name_val); - - VLOG(3) << s->GetVarHandlePtr()->String() << " begin"; - - // stub context - s->response_call_back_ = ProcGetResponse; - - platform::RecordRPCEvent record_event(method); - - auto call = s->stub_g_.PrepareUnaryCall( - s->context_.get(), "/sendrecv.SendRecvService/PrefetchVariable", req, - &cq_); - call->StartCall(); - call->Finish(&s->reply_, &s->status_, static_cast(s)); - - if (UNLIKELY(platform::IsProfileEnabled())) { - h->Wait(); - } - - req_count_++; - - if (FLAGS_rpc_retry_times > 0 && retry_times_ < FLAGS_rpc_retry_times) { - h->Wait(); - if (h->should_retry) { - VLOG(3) << "rpc call failed, retry times " << retry_times_; - retry_times_++; - std::random_device rd; - std::this_thread::sleep_for(std::chrono::milliseconds(rd() % 5)); - continue; - } - } - - return h; - } -} - -VarHandlePtr GRPCClient::AsyncSendBatchBarrier(const std::string& ep, - int64_t time_out) { - const auto ch = GetChannel(ep); - - BatchBarrierProcessor* s = new BatchBarrierProcessor(ch); - const std::string method = kBatchBarrierRPC; - VarHandlePtr h( - new VarHandle(ep, method, BATCH_BARRIER_MESSAGE, nullptr, nullptr)); - s->Prepare(h, time_out); - - sendrecv::VariableMessage req; - req.set_varname(BATCH_BARRIER_MESSAGE); - - platform::RecordRPCEvent record_event(method); - - auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_); - rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); - req_count_++; - - if (UNLIKELY(platform::IsProfileEnabled())) { - h->Wait(); - } - - return h; -} - -VarHandlePtr GRPCClient::AsyncSendFetchBarrier(const std::string& ep, - int64_t time_out) { - const auto ch = GetChannel(ep); - FetchBarrierProcessor* s = new FetchBarrierProcessor(ch); - const std::string method = kFetchBarrierRPC; - VarHandlePtr h( - new VarHandle(ep, method, FETCH_BARRIER_MESSAGE, nullptr, nullptr)); - s->Prepare(h, time_out); - - sendrecv::VariableMessage req; - req.set_varname(FETCH_BARRIER_MESSAGE); - - platform::RecordRPCEvent record_event(method); - - auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_); - rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); - req_count_++; - - if (UNLIKELY(platform::IsProfileEnabled())) { - h->Wait(); - } - - return h; -} - -VarHandlePtr GRPCClient::AsyncGetMonomerBarrier(const std::string& ep, - const std::string& var_name, - int64_t time_out) { - const auto ch = GetChannel(ep); - BatchBarrierProcessor* s = new BatchBarrierProcessor(ch); - 
const std::string method = kSendMonomerFetchBarrierRPC; - VarHandlePtr h(new VarHandle(ep, method, var_name, nullptr, nullptr)); - s->Prepare(h, time_out); - - VLOG(30) << s->GetVarHandlePtr()->String() << " begin"; - - sendrecv::VariableMessage req; - req.set_varname(var_name); - - platform::RecordRPCEvent record_event(method); - - auto rpc = s->stub_->AsyncGetMonomerBarrier(s->context_.get(), req, &cq_); - rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); - req_count_++; - - if (UNLIKELY(platform::IsProfileEnabled())) { - h->Wait(); - } - - return h; -} - -VarHandlePtr GRPCClient::AsyncSendComplete(const std::string& ep, - int64_t time_out) { - const auto ch = GetChannel(ep); - - BatchBarrierProcessor* s = new BatchBarrierProcessor(ch); - const std::string method = kSendCompleteRPC; - VarHandlePtr h(new VarHandle(ep, method, COMPLETE_MESSAGE, nullptr, nullptr)); - s->Prepare(h, time_out); - - sendrecv::VariableMessage req; - req.set_trainer_id(trainer_id_); - req.set_varname(COMPLETE_MESSAGE); - - platform::RecordRPCEvent record_event(method); - - auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_); - rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); - req_count_++; - - if (UNLIKELY(platform::IsProfileEnabled())) { - h->Wait(); - } - - return h; -} - -VarHandlePtr GRPCClient::AsyncCheckpointNotify(const std::string& ep, - const std::string& dirname, - const std::string& varname, - const int mode, - int64_t time_out) { - const auto ch = GetChannel(ep); - - CheckpointNotifyProcessor* s = new CheckpointNotifyProcessor(ch); - - const std::string method = kCheckPointNotifyRPC; - - VarHandlePtr h( - new VarHandle(ep, method, CHECKPOINT_SAVE_MESSAGE, nullptr, nullptr)); - s->Prepare(h, time_out); - - sendrecv::VariableMessage req; - req.set_varname(varname); - req.set_table_name(std::to_string(mode)); - req.set_out_varname(dirname); - - platform::RecordRPCEvent record_event(method); - - auto rpc = s->stub_->AsyncCheckpointNotify(s->context_.get(), req, &cq_); - rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); - req_count_++; - - if (UNLIKELY(platform::IsProfileEnabled())) { - h->Wait(); - } - - return h; -} - -VarHandlePtr GRPCClient::AsyncDistributeNotify( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - int64_t time_out) { - const platform::DeviceContext* p_ctx = &ctx; - const std::string ep_val = ep; - const std::string var_name_val = var_name; - const framework::Scope* p_scope = &scope; - const auto ch = GetChannel(ep_val); - const std::string method = kRequestNotify; - - SendProcessor* s = new SendProcessor(ch); - VarHandlePtr h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope)); - s->Prepare(h, time_out); - - framework::Async([var_name_val, p_scope, p_ctx, s, method, h, this] { - auto* var = p_scope->FindVar(var_name_val); - - ::grpc::ByteBuffer req; - SerializeToByteBuffer(var_name_val, var, *p_ctx, &req, "", trainer_id_); - - VLOG(3) << s->GetVarHandlePtr()->String() << " begin"; - - // stub context - s->response_call_back_ = nullptr; - - platform::RecordRPCEvent record_event(method); - - auto call = s->stub_g_.PrepareUnaryCall( - s->context_.get(), "/sendrecv.SendRecvService/DistributeNotify", req, - &cq_); - call->StartCall(); - call->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); - }); - req_count_++; - - if (UNLIKELY(platform::IsProfileEnabled())) { - h->Wait(); - } - - return h; -} - -VarHandlePtr GRPCClient::AsyncSendAndRecv(const std::string& ep, - 
const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& send_var_name, - const std::string& recv_var_name, - const std::string& table_name, - int64_t time_out) { - const platform::DeviceContext* p_ctx = &ctx; - const std::string ep_val = ep; - const std::string send_var_name_val = send_var_name; - const std::string recv_var_name_val = recv_var_name; - const std::string table_name_val = table_name; - const framework::Scope* p_scope = &scope; - const auto ch = GetChannel(ep_val); - const std::string method = kSendAndRecvRPC; - VLOG(4) << "GRPCClient::SendAndRecv Begin ,Send_var_name: " - << send_var_name_val << " Recv_var_name: " << recv_var_name_val; - int retry_times_ = 0; - - while (true) { - SendAndRecvProcessor* s = new SendAndRecvProcessor(ch); - VarHandlePtr h( - new VarHandle(ep, method, send_var_name_val, p_ctx, p_scope)); - VarHandlePtr h_recv( - new VarHandle(ep, method, recv_var_name_val, p_ctx, p_scope)); - s->Prepare(h, time_out); - s->RecvPrepare(h_recv); - - framework::Async([send_var_name_val, recv_var_name_val, table_name_val, - p_scope, p_ctx, s, method, h, this] { - auto* send_var = p_scope->FindVar(send_var_name_val); - send_var->GetMutable()->set_lod({}); - ::grpc::ByteBuffer buf; - VLOG(4) << "SerializeToByteBuffer: send_var_name_val: " - << send_var_name_val - << " recv_var_name_val: " << recv_var_name_val; - SerializeToByteBuffer(send_var_name_val, send_var, *p_ctx, &buf, - recv_var_name_val, trainer_id_, table_name_val); - - VLOG(3) << s->GetVarHandlePtr()->String() << " begin"; - - // stub context - s->response_call_back_ = ProcGetRecvResponse; - - platform::RecordRPCEvent record_event(method); - - auto call = s->stub_g_.PrepareUnaryCall( - s->context_.get(), "/sendrecv.SendRecvService/SendAndRecvVariable", - buf, &cq_); - call->StartCall(); - call->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); - - if (UNLIKELY(platform::IsProfileEnabled())) { - h->Wait(); - } - }); - req_count_++; - - if (FLAGS_rpc_retry_times > 0 && retry_times_ < FLAGS_rpc_retry_times) { - h->Wait(); - if (h->should_retry) { - VLOG(3) << "rpc call failed, retry times " << retry_times_; - retry_times_++; - std::random_device rd; - std::this_thread::sleep_for(std::chrono::milliseconds(rd() % 5)); - continue; - } - } - - return h; - } -} - -bool GRPCClient::Wait() { - std::unique_lock lk(sync_mutex_); - sync_cond_.wait(lk, [this] { return (req_count_ == 0 || ok_ == false); }); - return ok_; -} - -inline bool ShouldRetry(const std::string& method, int error_code) { - if (method == kPrefetchRPC) { - return true; - } - - if (error_code == grpc::StatusCode::DEADLINE_EXCEEDED) { - return true; - } - - return false; -} - -void GRPCClient::Proceed() { - void* tag = nullptr; - bool ok = false; - - VLOG(3) << "GRPCClient Proceed begin"; - while (!stopped_ && cq_.Next(&tag, &ok)) { - BaseProcessor* c = static_cast(tag); - GPR_ASSERT(ok); - PADDLE_ENFORCE_NOT_NULL( - c, platform::errors::PreconditionNotMet("Make BaseProcessor failed.")); - - if (c->status_.ok()) { - VLOG(3) << c->GetVarHandlePtr()->String() << " process"; - c->Process(); - } else if (ShouldRetry(c->GetVarHandlePtr()->method(), - c->status_.error_code())) { - VLOG(0) << c->GetVarHandlePtr()->String() - << " meets grpc error, error_code:" << c->status_.error_code() - << " error_message:" << c->status_.error_message() - << " error_details:" << c->status_.error_details() - << " should retry!"; - c->GetVarHandlePtr()->should_retry = true; - c->Finish(false); - } else { - 
PADDLE_THROW(platform::errors::External( - "%s meets grpc error, error_code is %d, error message is %s, error " - "details is %s.", - c->GetVarHandlePtr()->String(), c->status_.error_code(), - c->status_.error_message(), c->status_.error_details())); - c->Finish(false); - } - - bool notify = false; - { - std::lock_guard lk(sync_mutex_); - req_count_--; - notify = (req_count_ <= 0 || !c->status_.ok()); - } - - delete c; - - if (notify) { - sync_cond_.notify_all(); - } - } - - // Last log message - // Avoid using VLOG() and LOG(): in the destructor of google::LogMessage() a - // static Mutex log_mutex is used for synchronization, which might have been - // destructed at this moment. - if (FLAGS_v >= 3) { - std::string msg("GRPCClient Proceed end"); - fwrite(msg.c_str(), msg.length(), 1, stderr); - } -} - -std::shared_ptr GRPCClient::GetChannel(const std::string& ep) { - std::lock_guard guard(chan_mutex_); - auto it = channels_.find(ep); - if (it != channels_.end()) { - return it->second; - } - - // Channel configurations: - grpc::ChannelArguments args; - args.SetInt(GRPC_ARG_MAX_RECONNECT_BACKOFF_MS, 2000); - if (FLAGS_rpc_disable_reuse_port) { - args.SetInt(GRPC_ARG_ALLOW_REUSEPORT, 0); - } - args.SetCompressionAlgorithm(GRPC_COMPRESS_NONE); - args.SetMaxSendMessageSize(std::numeric_limits::max()); - args.SetMaxReceiveMessageSize(std::numeric_limits::max()); - - auto ch = - grpc::CreateCustomChannel(ep, grpc::InsecureChannelCredentials(), args); - channels_[ep] = ch; - return ch; -} - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc/grpc_client.h b/paddle/fluid/operators/distributed/grpc/grpc_client.h deleted file mode 100644 index 5885f944b60a15c2b4810a3968f0ee5406a36a70..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/grpc/grpc_client.h +++ /dev/null @@ -1,321 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
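The client calls deleted above (AsyncSendVar, _AsyncGetVar, AsyncPrefetchVar, AsyncSendAndRecv) share one retry idiom: issue the asynchronous call, Wait() on the returned handle, and if Proceed() marked the handle should_retry, sleep a few random milliseconds and re-issue, up to FLAGS_rpc_retry_times attempts. A generic standalone sketch of that loop follows; RpcResult, CallWithRetry, and the flaky callback are hypothetical stand-ins, not Paddle or gRPC APIs.

#include <chrono>
#include <functional>
#include <iostream>
#include <random>
#include <thread>

struct RpcResult {       // hypothetical stand-in for the VarHandle state
  bool ok = false;
  bool should_retry = false;
};

// Retry a call with a small random backoff, mirroring the pattern of the
// deleted client methods: issue + wait, then retry only when asked to.
RpcResult CallWithRetry(const std::function<RpcResult()>& issue_call,
                        int max_retries) {
  std::random_device rd;
  for (int attempt = 0;; ++attempt) {
    RpcResult r = issue_call();  // issue the RPC and wait for completion
    if (r.ok || !r.should_retry || attempt >= max_retries) return r;
    std::cout << "rpc call failed, retry times " << attempt << "\n";
    std::this_thread::sleep_for(std::chrono::milliseconds(rd() % 5));
  }
}

int main() {
  int failures_left = 2;
  auto flaky = [&]() {
    RpcResult r;
    r.ok = (failures_left-- <= 0);
    r.should_retry = !r.ok;
    return r;
  };
  RpcResult r = CallWithRetry(flaky, /*max_retries=*/3);
  std::cout << "final ok: " << std::boolalpha << r.ok << "\n";
  return 0;
}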
*/ - -#pragma once - -#include -#include -#include // NOLINT -#include // NOLINT -#include -#include -#include -#include -#include -#include // NOLINT -#include -#include // NOLINT -#include -#include - -#include "grpc++/channel.h" -#include "grpc++/generic/generic_stub.h" -#include "grpc++/grpc++.h" -#include "grpc++/support/byte_buffer.h" -#include "grpc++/support/slice.h" -#include "grpc/support/log.h" -#include "paddle/fluid/framework/blocking_queue.h" -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/operators/distributed/distributed_pb.h" -#include "paddle/fluid/operators/distributed/request_handler.h" -#include "paddle/fluid/operators/distributed/rpc_client.h" -#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" -#include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN - -namespace grpc { -class Channel; -} // namespace grpc -namespace paddle { -namespace framework { -class Scope; -} // namespace framework -namespace platform { -class DeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -namespace distributed { - -void ProcGetResponse(const VarHandle& var_h, const grpc::ByteBuffer& msg); - -void ProcGetRecvResponse(const VarHandle& var_h, const grpc::ByteBuffer& msg); - -class BaseProcessor { - public: - BaseProcessor() { context_ = nullptr; } - - virtual ~BaseProcessor() {} - - virtual void Prepare(VarHandlePtr h, int64_t time_out) { - var_h_ = h; - - context_.reset(new grpc::ClientContext()); - context_->set_wait_for_ready(true); - if (time_out) { - std::chrono::system_clock::time_point deadline = - std::chrono::system_clock::now() + - std::chrono::milliseconds(time_out); - context_->set_deadline(deadline); - } - } - - void Process() { - ProcessImpl(); - var_h_->Finish(true); - } - - VarHandlePtr GetVarHandlePtr() { return var_h_; } - bool Wait() { return var_h_->Wait(); } - void Finish(bool ok) { return var_h_->Finish(ok); } - virtual void ProcessImpl() = 0; - - std::unique_ptr context_; - grpc::Status status_; - - protected: - VarHandlePtr var_h_; -}; - -typedef std::function - RequestSendCallBack; - -class SendProcessor : public BaseProcessor { - public: - explicit SendProcessor(std::shared_ptr ch) - : BaseProcessor(), stub_g_(ch) {} - - virtual ~SendProcessor() {} - - void ProcessImpl() override { - if (response_call_back_) { - response_call_back_(*var_h_.get(), reply_); - } - } - - ::grpc::GenericStub stub_g_; - ::grpc::ByteBuffer reply_; - RequestSendCallBack response_call_back_ = nullptr; -}; - -typedef std::function - RequestGetCallBack; - -class GetProcessor : public BaseProcessor { - public: - explicit GetProcessor(std::shared_ptr ch) - : BaseProcessor(), stub_g_(ch) {} - - virtual ~GetProcessor() {} - - void ProcessImpl() override { - if (response_call_back_) { - response_call_back_(*var_h_.get(), reply_); - } - } - - ::grpc::ByteBuffer reply_; - ::grpc::GenericStub stub_g_; - RequestGetCallBack response_call_back_ = ProcGetResponse; -}; - -class SendAndRecvProcessor : public BaseProcessor { - public: - explicit SendAndRecvProcessor(std::shared_ptr ch) - : BaseProcessor(), stub_g_(ch) {} - - virtual ~SendAndRecvProcessor() {} - - void ProcessImpl() override { - if (response_call_back_) { - response_call_back_(*var_h_recv_.get(), reply_); - var_h_recv_->Finish(true); - } - } - - void RecvPrepare(VarHandlePtr h_recv) { 
var_h_recv_ = h_recv; } - - ::grpc::ByteBuffer reply_; - ::grpc::GenericStub stub_g_; - RequestGetCallBack response_call_back_ = ProcGetResponse; - VarHandlePtr var_h_recv_; -}; - -class BatchBarrierProcessor : public BaseProcessor { - public: - explicit BatchBarrierProcessor(std::shared_ptr ch) - : BaseProcessor() { - stub_ = sendrecv::SendRecvService::NewStub(ch); - } - - virtual ~BatchBarrierProcessor() {} - - void ProcessImpl() override {} - sendrecv::VoidMessage reply_; - std::unique_ptr stub_; -}; - -class FetchBarrierProcessor : public BaseProcessor { - public: - explicit FetchBarrierProcessor(std::shared_ptr ch) - : BaseProcessor() { - stub_ = sendrecv::SendRecvService::NewStub(ch); - } - - virtual ~FetchBarrierProcessor() {} - - void ProcessImpl() override {} - sendrecv::VariableMessage reply_; - std::unique_ptr stub_; -}; - -class CheckpointNotifyProcessor : public BaseProcessor { - public: - explicit CheckpointNotifyProcessor(std::shared_ptr ch) - : BaseProcessor() { - stub_ = sendrecv::SendRecvService::NewStub(ch); - } - - virtual ~CheckpointNotifyProcessor() {} - - void ProcessImpl() override {} - sendrecv::VoidMessage reply_; - std::unique_ptr stub_; -}; - -class GRPCClient : public RPCClient { - public: - GRPCClient() : ok_(true), completed_(false), stopped_(false) {} - virtual ~GRPCClient(); - - VarHandlePtr AsyncSendVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncGetVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - const std::string& out_varname, - const std::string& table_name = "", - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncGetVarNoBarrier( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - const std::string& out_varname, - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncGetMonomerVariable( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncPrefetchVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& in_var_name, - const std::string& out_var_name, - const std::string& table_name = "", - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncSendBatchBarrier( - const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncSendFetchBarrier(const std::string& ep, - int64_t time_out) override; - - VarHandlePtr AsyncGetMonomerBarrier( - const std::string& ep, const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncCheckpointNotify( - const std::string& ep, const std::string& dirname, - const std::string& varname, const int mode, - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncDistributeNotify( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncSendAndRecv(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& send_var_name, - const std::string& recv_var_name, - const std::string& table_name = "", 
- int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncSendComplete( - const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override; - - bool Wait() override; - - void SendComplete() override; - - void InitImpl() override; - - private: - void Proceed(); - - std::shared_ptr GetChannel(const std::string& ep); - VarHandlePtr _AsyncGetVar( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& method, - const std::string& var_name, const std::string& out_varname, - const std::string& rpc_path, const std::string& table_name = "", - int64_t time_out = FLAGS_rpc_deadline); - - private: - grpc::CompletionQueue cq_; - std::unordered_map> channels_; - std::vector> client_threads_; - - // mutex for Wait client sync - std::mutex sync_mutex_; - std::condition_variable sync_cond_; - std::atomic req_count_{0}; - bool ok_; - - // mutex for GetChannel thread safety - std::mutex chan_mutex_; - DISABLE_COPY_AND_ASSIGN(GRPCClient); - - // mutex for sending complete message only once - std::mutex completed_mutex_; - bool completed_; - - volatile bool stopped_; -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc/grpc_serde.cc b/paddle/fluid/operators/distributed/grpc/grpc_serde.cc deleted file mode 100644 index 0fc9b6957791490192a142321792b82cf906bfc9..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/grpc/grpc_serde.cc +++ /dev/null @@ -1,190 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef PADDLE_WITH_NCCL -#include -#endif -#ifdef PADDLE_WITH_RCCL -#include -#endif -#include -#include -#include "grpcpp/impl/codegen/byte_buffer.h" -#include "grpcpp/impl/codegen/slice.h" -#include "paddle/fluid/operators/distributed/grpc/grpc_serde.h" -#include "paddle/fluid/operators/distributed/grpc/grpc_variable_response.h" -#include "paddle/fluid/operators/distributed/proto_encoder_helper.h" -#include "paddle/fluid/operators/distributed/send_recv.pb.h" -#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/profiler.h" - -namespace paddle { -namespace framework { -class Scope; -class Variable; -} // namespace framework -namespace platform { -class DeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -namespace distributed { - -void SerializeToByteBuffer(const std::string& name, framework::Variable* var, - const platform::DeviceContext& ctx, - ::grpc::ByteBuffer* msg, const std::string& out_name, - const int trainer_id, - const std::string& table_name) { - platform::RecordRPCEvent record_event("serial"); - VarMsg request; - TensorPayload* payload = nullptr; - - request.set_varname(name); - request.set_trainer_id(trainer_id); - // Note: normally the profiler is enabled in 1 trainer, hence only - // 1 trainer returns true for ShouldSendProfileState(). It tells PS - // servers the trainer's profiling state so that PS can follow the - // trainer. - if (platform::ShouldSendProfileState()) { - if (platform::IsProfileEnabled()) { - request.set_profile(platform::kEnableProfiler); - } else { - request.set_profile(platform::kDisableProfiler); - } - } - if (!out_name.empty()) { - request.set_out_varname(out_name); - } - if (!table_name.empty()) { - request.set_table_name(table_name); - } - if (var->IsType()) { - request.set_type(::sendrecv::LOD_TENSOR); - payload = new TensorPayload(GetTensorPayload(var, ctx, &request)); - } else if (var->IsType()) { - request.set_type(::sendrecv::SELECTED_ROWS); - payload = new TensorPayload(GetSelectedRowsPayload(var, ctx, &request)); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - } else if (var->IsType()) { - request.set_type(::sendrecv::NCCL_ID); -#endif - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Serialize does not support type: %s", typeid(var->Type()).name())); - } - std::string header; - request.AppendToString(&header); - auto buffer = std::unique_ptr(new char[1024]); - void* buf = buffer.get(); - ProtoEncodeHelper e(static_cast(buf), 1024); - e.WriteRawBytes(std::string(header.data(), header.size())); -// NCCLID is copied directly to the message, return bytebuffer -// with only one slice if serializing NCCLID. 
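The rest of SerializeToByteBuffer below assembles the outgoing message from up to four gRPC slices so that the tensor bytes are never copied: slice 0 holds the encoded VarMsg header, slice 1 references the tensor payload in place, and SelectedRows additionally contribute slice 2 (the rows-length field) and slice 3 (the raw row ids). A standalone sketch of that layout using plain byte strings (illustrative only; the header text and sizes are invented and this is not the real wire encoding):

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

int main() {
  // Pretend payloads; in the real code slices 1 and 3 alias tensor memory.
  std::string header = "varname=myvar;type=SELECTED_ROWS";      // slice 0
  std::string tensor_payload(564 * 128 * sizeof(float), '\0');  // slice 1
  std::vector<int64_t> rows = {0, 1, 2, 3};
  std::string rows_meta =
      "rows_bytes=" + std::to_string(rows.size() * sizeof(int64_t));  // slice 2
  std::string rows_bytes(reinterpret_cast<const char*>(rows.data()),
                         rows.size() * sizeof(int64_t));              // slice 3

  std::vector<std::string> slices = {header, tensor_payload, rows_meta,
                                     rows_bytes};
  std::size_t total = 0;
  for (std::size_t i = 0; i < slices.size(); ++i) {
    std::cout << "slice " << i << ": " << slices[i].size() << " bytes\n";
    total += slices[i].size();
  }
  std::cout << "message length: " << total << " bytes\n";
  return 0;
}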
-#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - if (var->IsType()) { - e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber, - NCCL_UNIQUE_ID_BYTES); - const ncclUniqueId& uid = var->Get(); - e.WriteRawBytes(std::string(uid.internal, NCCL_UNIQUE_ID_BYTES)); - - // for serialize NCCL_ID - ::grpc::Slice slices(e.size()); - memcpy(const_cast(slices.begin()), e.data(), e.size()); - ::grpc::ByteBuffer tmp(&slices, 1); - msg->Swap(&tmp); - return; - } -#endif - PADDLE_ENFORCE_NOT_NULL( - payload, - platform::errors::InvalidArgument( - "Not support type: %s, need to be LOD_TENSOR or SELECTED_ROWS", - var->Type())); - e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber, - payload->memory_size()); - if (payload->memory_size() >= std::numeric_limits::max()) { - PADDLE_THROW(platform::errors::InvalidArgument( - "Variable %s length %d should less than %d.", name, - payload->memory_size(), std::numeric_limits::max())); - } - // steal reference of tensor data - ::grpc::Slice slices[4]; // metadata, tensor, rows meta, rows - int num_slices = 2; // only SelectedRows have rows buffer - slices[0] = ::grpc::Slice(e.size()); - memcpy(const_cast(slices[0].begin()), e.data(), e.size()); - slices[1] = ::grpc::Slice( - grpc_slice_new_with_user_data(payload->ptr(), payload->memory_size(), - SerializeDestroyCallback, payload), - ::grpc::Slice::STEAL_REF); - - if (var->IsType()) { - auto* slr = var->GetMutable(); - ProtoEncodeHelper e2(static_cast(buf), 128); - - PADDLE_ENFORCE_EQ(VectorElemName(slr->rows()), typeid(int64_t).name(), - platform::errors::InvalidArgument( - "Got wrong type %s, expect type: int64_t", - VectorElemName(slr->rows()))); - size_t rows_memory_size = slr->rows().size() * sizeof(int64_t); - - e2.WriteVarlengthBeginning(VarMsg::kRowsFieldNumber, rows_memory_size); - slices[2] = ::grpc::Slice(e2.size()); - memcpy(const_cast(slices[2].begin()), e2.data(), e2.size()); - - slices[3] = ::grpc::Slice( - grpc_slice_new_with_user_data( - const_cast( - reinterpret_cast(slr->rows().data())), - rows_memory_size, [](void* backing) {}, - const_cast( - reinterpret_cast(slr->rows().data()))), - ::grpc::Slice::STEAL_REF); - num_slices = 4; - } - ::grpc::ByteBuffer tmp(&slices[0], num_slices); - msg->Swap(&tmp); -} - -void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg, - const platform::DeviceContext& ctx, - const framework::Scope* scope, - framework::Variable** var, int* trainer_id) { - platform::RecordRPCEvent record_event("deserial"); - operators::distributed::GRPCVariableResponse resp(scope, &ctx); - PADDLE_ENFORCE_EQ( - resp.Parse(msg), 0, - platform::errors::InvalidArgument("parse bytebuffer to tensor error!")); - *var = resp.GetVar(); - *trainer_id = resp.GetTrainerId(); -} - -void DeserializeRecvFromByteBuffer(const ::grpc::ByteBuffer& msg, - const platform::DeviceContext& ctx, - const framework::Scope* scope, - framework::Variable** var, int* trainer_id) { - platform::RecordRPCEvent record_event("deserial"); - operators::distributed::GRPCVariableResponse resp(scope, &ctx); - PADDLE_ENFORCE_EQ( - resp.Parse(msg), 0, - platform::errors::InvalidArgument("parse bytebuffer to tensor error!")); - *var = resp.GetRecvVar(); - *trainer_id = resp.GetTrainerId(); -} - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc/grpc_serde.h b/paddle/fluid/operators/distributed/grpc/grpc_serde.h deleted file mode 100644 index 932f3e2f069a2bfe1dec9318446e1bf064d2e317..0000000000000000000000000000000000000000 --- 
a/paddle/fluid/operators/distributed/grpc/grpc_serde.h +++ /dev/null @@ -1,69 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/distributed/distributed_pb.h" -#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" -#include "paddle/fluid/platform/port.h" - -namespace grpc { -class ByteBuffer; -} // namespace grpc -namespace paddle { -namespace framework { -class Scope; -class Variable; -} // namespace framework -namespace platform { -class DeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -namespace distributed { - -typedef void (*DestroyCallback)(void*); - -void SerializeToByteBuffer(const std::string& name, framework::Variable* var, - const platform::DeviceContext& ctx, - ::grpc::ByteBuffer* msg, - const std::string& out_varname = std::string(), - const int trainer_id = 0, - const std::string& table_name = std::string()); - -void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg, - const platform::DeviceContext& ctx, - const framework::Scope* scope, - framework::Variable** var, int* trainer_id); - -void DeserializeRecvFromByteBuffer(const ::grpc::ByteBuffer& msg, - const platform::DeviceContext& ctx, - const framework::Scope* scope, - framework::Variable** var, int* trainer_id); - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc/grpc_serde_test.cc b/paddle/fluid/operators/distributed/grpc/grpc_serde_test.cc deleted file mode 100644 index d407a72938a741d9451f46ba067e639e3ada6544..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/grpc/grpc_serde_test.cc +++ /dev/null @@ -1,224 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include // NOLINT - -#include "google/protobuf/text_format.h" -#include "gtest/gtest.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/framework/variable.h" -#include "paddle/fluid/operators/distributed/distributed.h" -#include "paddle/fluid/operators/distributed/grpc/grpc_serde.h" -#include "paddle/fluid/operators/distributed/grpc/grpc_variable_response.h" -#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" -#include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/platform/place.h" -#include "paddle/fluid/string/printf.h" - -namespace framework = paddle::framework; -namespace platform = paddle::platform; -namespace operators = paddle::operators; -namespace math = paddle::operators::math; -namespace memory = paddle::memory; - -void RunSerdeTestSelectedRows(platform::Place place) { - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto& ctx = *pool.Get(place); - - // serialize var to ByteBuffer - framework::Variable var; - auto* slr = var.GetMutable(); - slr->set_height(1000); - auto* tensor = slr->mutable_value(); - auto* rows = slr->mutable_rows(); - tensor->Resize(framework::make_ddim({564, 128})); - tensor->mutable_data(place); - int tensor_numel = 564 * 128; - math::set_constant(ctx, tensor, 32.7); - for (int i = 0; i < 564; ++i) rows->push_back(i); - - ::grpc::ByteBuffer msg; - operators::distributed::SerializeToByteBuffer("myvar", &var, ctx, &msg); - EXPECT_GT(msg.Length(), static_cast(0)); - - // deserialize - std::vector<::grpc::Slice> slices; - (void)msg.Dump(&slices); - std::string tmp; - for (const auto& s : slices) { - tmp.append(reinterpret_cast(s.begin()), s.size()); - } - - sendrecv::VariableMessage varmsg; - EXPECT_TRUE(varmsg.ParseFromString(tmp)); - - // deserialize bytebuffer - EXPECT_EQ(varmsg.varname(), "myvar"); - EXPECT_EQ(varmsg.type(), 1); - - const float* tensor_data = - reinterpret_cast(varmsg.serialized().data()); - const int64_t* rows_data = - reinterpret_cast(varmsg.rows().data()); - for (int i = 0; i < tensor_numel; ++i) { - EXPECT_FLOAT_EQ(tensor_data[i], 32.7); - } - for (int i = 0; i < 564; ++i) { - EXPECT_EQ(rows_data[i], i); - } - - // deserialize zero-copy - // framework::Variable var2; - // operators::distributed::DeserializeFromByteBuffer(msg, ctx, &var2); - framework::Scope scope; - scope.Var("myvar"); - operators::distributed::GRPCVariableResponse resp(&scope, &ctx); - EXPECT_EQ(resp.Parse(msg), 0); - - framework::Variable* var2 = resp.GetVar(); - - auto* slr2 = var2->GetMutable(); - auto* tensor2 = slr2->mutable_value(); - auto* rows2 = slr2->mutable_rows(); - float* tensor_data2 = nullptr; - framework::Tensor tmp_tensor; - - if (platform::is_gpu_place(ctx.GetPlace())) { - platform::CPUPlace cpu; - framework::TensorCopy(*tensor2, cpu, &tmp_tensor); - tensor_data2 = tmp_tensor.data(); - } else { - tensor_data2 = const_cast(tensor2->data()); - } - const int64_t* rows_data2 = rows2->data(); - - for (int i = 0; i < tensor_numel; ++i) { - EXPECT_FLOAT_EQ(tensor_data2[i], 32.7); - } - for (size_t i = 0; i < rows2->size(); ++i) { - EXPECT_EQ(rows_data2[i], static_cast(i)); - } - EXPECT_EQ(slr2->height(), 1000); -} - -void RunTestLodTensor(platform::Place place, int from_type = 0) { - // serialize var to ByteBuffer - framework::Variable var; - auto* tensor = var.GetMutable(); - tensor->Resize(framework::make_ddim({512, 8, 4, 2})); - framework::LoD lod; - lod.push_back(framework::Vector({1, 
3, 8})); - tensor->set_lod(lod); - int tensor_numel = 512 * 8 * 4 * 2; - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto& ctx = *pool.Get(place); - tensor->mutable_data(place); - math::set_constant(ctx, tensor, 31.9); - - ::grpc::ByteBuffer msg; - operators::distributed::SerializeToByteBuffer("myvar", &var, ctx, &msg, - "outvar", 0, "table_name"); - EXPECT_GT(msg.Length(), static_cast(0)); - - // deserialize - std::vector<::grpc::Slice> slices; - (void)msg.Dump(&slices); - std::string tmp; - for (const auto& s : slices) { - tmp.append(reinterpret_cast(s.begin()), s.size()); - } - sendrecv::VariableMessage varmsg; - EXPECT_TRUE(varmsg.ParseFromString(tmp)); - EXPECT_EQ(varmsg.varname(), "myvar"); - EXPECT_EQ(varmsg.type(), 0); - EXPECT_EQ(varmsg.dims()[0], 512); - EXPECT_EQ(varmsg.dims()[1], 8); - EXPECT_EQ(varmsg.dims()[2], 4); - EXPECT_EQ(varmsg.dims()[3], 2); - EXPECT_EQ(varmsg.lod_level(), 1); - EXPECT_EQ(varmsg.lod(0).lod_data(0), 1); - EXPECT_EQ(varmsg.lod(0).lod_data(1), 3); - EXPECT_EQ(varmsg.lod(0).lod_data(2), 8); - - const float* tensor_data = - reinterpret_cast(varmsg.serialized().data()); - for (int i = 0; i < tensor_numel; ++i) { - EXPECT_FLOAT_EQ(tensor_data[i], 31.9); - } - - // message binary - std::string str; - varmsg.SerializeToString(&str); - - // message bytebuffer - ::grpc::Slice slices_2[1]; - int num_slices = 1; - slices_2[0] = ::grpc::Slice(str.length()); - memcpy(const_cast(slices_2[0].begin()), str.c_str(), str.length()); - ::grpc::ByteBuffer bytebuffer2(&slices_2[0], num_slices); - - // deserialize zero-copy - framework::Scope scope; - scope.Var("myvar"); - operators::distributed::GRPCVariableResponse resp(&scope, &ctx); - if (from_type == 0) { - EXPECT_EQ(resp.Parse(msg), 0); - } else { - EXPECT_EQ(resp.Parse(bytebuffer2), 0); - } - - framework::Variable* var2 = resp.GetVar(); - - auto tensor2 = var2->Get(); - float* tensor_data2 = nullptr; - framework::Tensor tmp_tensor; - - if (platform::is_gpu_place(ctx.GetPlace())) { - platform::CPUPlace cpu; - framework::TensorCopy(tensor2, cpu, &tmp_tensor); - tensor_data2 = tmp_tensor.data(); - } else { - tensor_data2 = const_cast(tensor2.data()); - } - - EXPECT_EQ(varmsg.lod_level(), 1); - EXPECT_EQ(varmsg.lod(0).lod_data(0), 1); - EXPECT_EQ(varmsg.lod(0).lod_data(1), 3); - EXPECT_EQ(varmsg.lod(0).lod_data(2), 8); - for (int i = 0; i < tensor_numel; ++i) EXPECT_FLOAT_EQ(tensor_data2[i], 31.9); -} - -TEST(LodTensor, Run) { - platform::CPUPlace place; - RunTestLodTensor(place); - RunTestLodTensor(place, 1); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - platform::CUDAPlace gpu(0); - RunTestLodTensor(gpu); - RunTestLodTensor(gpu, 1); -#endif -} - -TEST(SelectedRows, Run) { - platform::CPUPlace place; - RunSerdeTestSelectedRows(place); - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - platform::CUDAPlace gpu; - RunSerdeTestSelectedRows(gpu); -#endif -} diff --git a/paddle/fluid/operators/distributed/grpc/grpc_server.cc b/paddle/fluid/operators/distributed/grpc/grpc_server.cc deleted file mode 100644 index 912520d782d7568e5a6da63b7ac8bb35a5b95439..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/grpc/grpc_server.cc +++ /dev/null @@ -1,720 +0,0 @@ -/*Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
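The serde test removed here follows a round-trip pattern: fill a variable with a known constant (31.9 for the LoD tensor case, 32.7 for SelectedRows), serialize it to a ByteBuffer, flatten the slices into a string, parse that back, and compare every element. A stripped-down sketch of the same round-trip idea over a plain float vector (illustrative only; no protobuf or Paddle types are involved):

#include <cassert>
#include <cstddef>
#include <cstring>
#include <string>
#include <vector>

// Toy "serialize": raw float bytes appended to a string.
std::string Serialize(const std::vector<float>& values) {
  std::string buf(values.size() * sizeof(float), '\0');
  std::memcpy(&buf[0], values.data(), buf.size());
  return buf;
}

std::vector<float> Deserialize(const std::string& buf) {
  std::vector<float> values(buf.size() / sizeof(float));
  std::memcpy(values.data(), buf.data(), buf.size());
  return values;
}

int main() {
  // Mirror the deleted test's shape and constant: 512*8*4*2 floats of 31.9f.
  std::vector<float> original(512 * 8 * 4 * 2, 31.9f);
  std::vector<float> restored = Deserialize(Serialize(original));
  assert(restored.size() == original.size());
  for (std::size_t i = 0; i < restored.size(); ++i) {
    assert(restored[i] == original[i]);
  }
  return 0;
}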
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include -#include - -#include "paddle/fluid/operators/distributed/grpc/grpc_serde.h" -#include "paddle/fluid/operators/distributed/grpc/grpc_server.h" - -namespace grpc { -class ChannelArguments; -} // namespace grpc -namespace paddle { -namespace framework { -class Variable; -} // namespace framework -namespace operators { -namespace distributed { -class GRPCVariableResponse; -} // namespace distributed -} // namespace operators -} // namespace paddle - -using ::grpc::ServerAsyncResponseWriter; - -DECLARE_bool(rpc_disable_reuse_port); -DECLARE_int32(rpc_retry_bind_port); - -namespace paddle { -namespace operators { -namespace distributed { - -enum CallStatus { PROCESS = 0, FINISH }; - -// reference: -// https://stackoverflow.com/questions/41732884/grpc-multiple-services-in-cpp-async-server -class RequestBase { - public: - explicit RequestBase(GrpcService::AsyncService* service, - ::grpc::ServerCompletionQueue* cq, - RequestHandler* request_handler, int req_id) - : service_(service), - cq_(cq), - status_(PROCESS), - request_handler_(request_handler), - req_id_(req_id) { - PADDLE_ENFORCE_NOT_NULL(cq_, platform::errors::InvalidArgument( - "ServerCompletionQueue cq are empty")); - } - virtual ~RequestBase() {} - virtual void Process() = 0; - - std::string Status2String(const std::string& method) { - std::string status = "Process"; - if (status_ == FINISH) { - status = "Finish"; - } - - std::ostringstream s; - s << method << " name:[" << GetReqName() << "]" - << ", ep:[" << ctx_.peer() << "]" - << " " << status << " using req_id:" << req_id_; - return s.str(); - } - - CallStatus Status() const { - std::lock_guard l(status_mu_); - return status_; - } - - template - void Finish(const T& reply, ServerAsyncResponseWriter* responder) { - std::lock_guard l(status_mu_); - status_ = FINISH; - responder->Finish(reply, ::grpc::Status::OK, - reinterpret_cast(static_cast(req_id_))); - } - virtual std::string GetReqName() = 0; - - protected: - mutable std::mutex status_mu_; - ::grpc::ServerContext ctx_; - GrpcService::AsyncService* service_; - ::grpc::ServerCompletionQueue* cq_; - CallStatus status_; - RequestHandler* request_handler_; - int req_id_; -}; - -class RequestSend final : public RequestBase { - public: - explicit RequestSend(GrpcService::AsyncService* service, - ::grpc::ServerCompletionQueue* cq, - RequestHandler* request_handler, int req_id) - : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) { - request_.reset(new GRPCVariableResponse(request_handler->scope(), - request_handler->dev_ctx(), true)); - int method_id = static_cast(distributed::GrpcMethod::kSendVariable); - service_->RequestAsyncUnary( - method_id, &ctx_, request_.get(), &responder_, cq_, cq_, - reinterpret_cast(static_cast(req_id))); - } - virtual ~RequestSend() {} - std::string GetReqName() override { return request_->Varname(); } - - void Process() override { - std::string varname = GetReqName(); - - auto scope = request_->GetMutableLocalScope(); - auto invar = request_->GetVar(); - int trainer_id = request_->GetTrainerId(); - - VLOG(4) << "RequestSend var_name:" << varname << " trainer: 
" << trainer_id; - - framework::Variable* outvar = nullptr; - request_handler_->Handle(varname, scope, invar, &outvar, trainer_id); - Finish(reply_, &responder_); - } - - protected: - sendrecv::VoidMessage reply_; - std::shared_ptr request_; - ServerAsyncResponseWriter responder_; -}; - -class RequestGet final : public RequestBase { - public: - explicit RequestGet(GrpcService::AsyncService* service, - ::grpc::ServerCompletionQueue* cq, - RequestHandler* request_handler, int req_id) - : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) { - auto method_id = static_cast(distributed::GrpcMethod::kGetVariable); - service_->RequestAsyncUnary( - method_id, &ctx_, &request_, &responder_, cq_, cq_, - reinterpret_cast(static_cast(req_id))); - } - - virtual ~RequestGet() {} - - std::string GetReqName() override { return request_.varname(); } - - void Process() override { - // proc request. - std::string varname = request_.varname(); - std::string out_varname = request_.out_varname(); - std::string table_name = request_.table_name(); - int trainer_id = request_.trainer_id(); - - VLOG(4) << "RequestGet " << out_varname << " from " << varname; - - auto scope = request_handler_->scope(); - framework::Variable* invar = nullptr; - framework::Variable* outvar = nullptr; - - tmp_scope_ = std::move(scope->NewTmpScope()); - request_handler_->Handle(varname, tmp_scope_.get(), invar, &outvar, - trainer_id, out_varname, table_name); - - VLOG(1) << "before SerializeToByteBuffer"; - if (outvar) { - SerializeToByteBuffer(out_varname, outvar, *request_handler_->dev_ctx(), - &reply_); - } - VLOG(1) << "after SerializeToByteBuffer"; - Finish(reply_, &responder_); - } - - protected: - sendrecv::VariableMessage request_; - ::grpc::ByteBuffer reply_; - std::unique_ptr tmp_scope_; - ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_; -}; - -class RequestGetNoBarrier final : public RequestBase { - public: - explicit RequestGetNoBarrier(GrpcService::AsyncService* service, - ::grpc::ServerCompletionQueue* cq, - RequestHandler* request_handler, int req_id) - : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) { - auto method_id = - static_cast(distributed::GrpcMethod::kGetVariableNoBarrier); - service_->RequestAsyncUnary( - method_id, &ctx_, &request_, &responder_, cq_, cq_, - reinterpret_cast(static_cast(req_id))); - } - - virtual ~RequestGetNoBarrier() {} - - std::string GetReqName() override { return request_.varname(); } - - void Process() override { - // proc request. 
- std::string varname = request_.varname(); - std::string out_varname = request_.out_varname(); - int trainer_id = request_.trainer_id(); - - VLOG(4) << "RequestGetNoBarrier " << out_varname << " from " << varname; - - auto scope = request_handler_->scope(); - framework::Variable* invar = nullptr; - framework::Variable* outvar = nullptr; - - request_handler_->Handle(varname, scope, invar, &outvar, trainer_id, - out_varname); - - if (outvar) { - SerializeToByteBuffer(out_varname, outvar, *request_handler_->dev_ctx(), - &reply_); - } - Finish(reply_, &responder_); - } - - protected: - sendrecv::VariableMessage request_; - ::grpc::ByteBuffer reply_; - ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_; -}; - -class RequestGetMonomerVariable final : public RequestBase { - public: - explicit RequestGetMonomerVariable(GrpcService::AsyncService* service, - ::grpc::ServerCompletionQueue* cq, - RequestHandler* request_handler, - int req_id, RPCServer* rpc_server) - : RequestBase(service, cq, request_handler, req_id), - responder_(&ctx_), - rpc_server_(rpc_server) { - auto method_id = - static_cast(distributed::GrpcMethod::kGetMonomerVariable); - service_->RequestAsyncUnary( - method_id, &ctx_, &request_, &responder_, cq_, cq_, - reinterpret_cast(static_cast(req_id))); - } - - virtual ~RequestGetMonomerVariable() {} - - std::string GetReqName() override { return request_.varname(); } - - void Process() override { - // proc request. - std::string varname = request_.varname(); - - rpc_server_->WaitVarCond(varname); - MonomerHandle h = rpc_server_->GetMonomer(varname); - - auto scope = h.scope_; - auto invar = scope->FindVar(varname); - framework::Variable* outvar = nullptr; - - request_handler_->Handle(varname, scope, invar, &outvar, - request_.trainer_id()); - - if (outvar) { - SerializeToByteBuffer(varname, outvar, *h.dev_ctx_, &reply_); - } - Finish(reply_, &responder_); - } - - protected: - sendrecv::VariableMessage request_; - ::grpc::ByteBuffer reply_; - ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_; - RPCServer* rpc_server_{nullptr}; -}; - -class RequestGetMonomerBarrier final : public RequestBase { - public: - explicit RequestGetMonomerBarrier(GrpcService::AsyncService* service, - ::grpc::ServerCompletionQueue* cq, - RequestHandler* request_handler, int req_id, - RPCServer* rpc_server) - : RequestBase(service, cq, request_handler, req_id), - responder_(&ctx_), - rpc_server_(rpc_server) { - auto method_id = - static_cast(distributed::GrpcMethod::kGetMonomerBarrier); - service_->RequestAsyncUnary( - method_id, &ctx_, &request_, &responder_, cq_, cq_, - reinterpret_cast(static_cast(req_id))); - } - - virtual ~RequestGetMonomerBarrier() {} - - std::string GetReqName() override { return request_.varname(); } - - void Process() override { - // proc request. 
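Each request class in this removed server hands its slot index to gRPC as the completion-queue tag by pushing req_id through intptr_t to void*, and the serving loop later reverses the cast to find the request again in rpc_reqs_. As a reminder of that round-trip, here is a minimal, gRPC-free sketch; req_id below is a hypothetical slot number, not a value taken from this patch.

#include <cassert>
#include <cstdint>

int main() {
  // Encode a request-slot index as a completion-queue tag (a void*), the
  // same pattern the removed code uses when calling RequestAsyncUnary.
  int req_id = 42;  // hypothetical slot index into rpc_reqs_
  void* tag = reinterpret_cast<void*>(static_cast<std::intptr_t>(req_id));

  // When cq->Next(&tag, &ok) later hands the tag back, decode it again.
  int decoded = static_cast<int>(reinterpret_cast<std::intptr_t>(tag));
  assert(decoded == req_id);
  return 0;
}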
- std::string varname = request_.varname(); - VLOG(4) << "RequestGetMonomerBarrier " << varname; - - rpc_server_->WaitVarCond(varname); - MonomerHandle h = rpc_server_->GetMonomer(varname); - - framework::Scope* scope = nullptr; - framework::Variable* invar = nullptr; - framework::Variable* outvar = nullptr; - - request_handler_->Handle(varname, scope, invar, &outvar, - request_.trainer_id()); - - Finish(reply_, &responder_); - } - - protected: - sendrecv::VariableMessage request_; - sendrecv::VoidMessage reply_; - ServerAsyncResponseWriter responder_; - RPCServer* rpc_server_{nullptr}; -}; - -class RequestPrefetch final : public RequestBase { - public: - explicit RequestPrefetch(GrpcService::AsyncService* service, - ::grpc::ServerCompletionQueue* cq, - RequestHandler* request_handler, int req_id) - : RequestBase(service, cq, request_handler, req_id), - responder_(&ctx_), - local_scope_(nullptr) { - request_.reset(new GRPCVariableResponse(request_handler->scope(), - request_handler->dev_ctx(), true)); - int method_id = - static_cast(distributed::GrpcMethod::kPrefetchVariable); - service_->RequestAsyncUnary( - method_id, &ctx_, request_.get(), &responder_, cq_, cq_, - reinterpret_cast(static_cast(req_id))); - } - - virtual ~RequestPrefetch() {} - - std::string GetReqName() override { return request_->Varname(); } - - void Process() override { - // prefetch process... - std::string in_var_name = request_->Varname(); - std::string out_var_name = request_->OutVarname(); - std::string table_name = request_->TableName(); - int trainer_id = request_->GetTrainerId(); - - VLOG(4) << "RequestPrefetch, in_var_name: " << in_var_name - << " out_var_name: " << out_var_name << " trainer: " << trainer_id; - - auto scope = request_->GetMutableLocalScope(); - auto invar = scope->FindVar(in_var_name); - // out var must be created in local scope! 
- framework::Variable* outvar = scope->Var(out_var_name); - - request_handler_->Handle(in_var_name, scope, invar, &outvar, trainer_id, - out_var_name, table_name); - - SerializeToByteBuffer(out_var_name, outvar, *request_handler_->dev_ctx(), - &reply_); - Finish(reply_, &responder_); - } - - protected: - std::shared_ptr request_; - ::grpc::ByteBuffer reply_; - ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_; - framework::Scope* local_scope_; -}; - -class RequestCheckpointNotify final : public RequestBase { - public: - explicit RequestCheckpointNotify(GrpcService::AsyncService* service, - ::grpc::ServerCompletionQueue* cq, - RequestHandler* request_handler, int req_id) - : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) { - request_.reset(new GRPCVariableResponse(request_handler->scope(), - request_handler->dev_ctx())); - int method_id = - static_cast(distributed::GrpcMethod::kCheckpointNotify); - service_->RequestAsyncUnary( - method_id, &ctx_, request_.get(), &responder_, cq_, cq_, - reinterpret_cast(static_cast(req_id))); - } - - virtual ~RequestCheckpointNotify() {} - - std::string GetReqName() override { return request_->Varname(); } - - void Process() override { - auto scope = request_->GetMutableLocalScope(); - - std::string checkpoint_notify = request_->Varname(); - std::string checkpoint_dir = request_->OutVarname(); - int trainer_id = request_->GetTrainerId(); - std::string table_name = request_->TableName(); - - VLOG(4) << "RequestCheckpointNotify notify: " << checkpoint_notify - << ", dir: " << checkpoint_dir; - - request_handler_->Handle(checkpoint_notify, scope, nullptr, nullptr, - trainer_id, checkpoint_dir, table_name); - Finish(reply_, &responder_); - } - - protected: - std::shared_ptr request_; - sendrecv::VoidMessage reply_; - ServerAsyncResponseWriter responder_; -}; - -class RequestNotify final : public RequestBase { - public: - explicit RequestNotify(GrpcService::AsyncService* service, - ::grpc::ServerCompletionQueue* cq, - RequestHandler* request_handler, int req_id) - : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) { - request_.reset(new GRPCVariableResponse(request_handler->scope(), - request_handler->dev_ctx(), true)); - int method_id = static_cast(distributed::GrpcMethod::kRequestNotify); - service_->RequestAsyncUnary( - method_id, &ctx_, request_.get(), &responder_, cq_, cq_, - reinterpret_cast(static_cast(req_id))); - } - virtual ~RequestNotify() {} - std::string GetReqName() override { return request_->Varname(); } - - void Process() override { - std::string varname = GetReqName(); - VLOG(4) << "RequestNotify var_name:" << varname; - - auto scope = request_->GetMutableLocalScope(); - auto invar = request_->GetVar(); - int trainer_id = request_->GetTrainerId(); - framework::Variable* outvar = nullptr; - request_handler_->Handle(varname, scope, invar, &outvar, trainer_id); - Finish(reply_, &responder_); - } - - protected: - sendrecv::VoidMessage reply_; - std::shared_ptr request_; - ServerAsyncResponseWriter responder_; -}; - -class RequestSendAndRecv final : public RequestBase { - public: - explicit RequestSendAndRecv(GrpcService::AsyncService* service, - ::grpc::ServerCompletionQueue* cq, - RequestHandler* request_handler, int req_id) - : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) { - request_.reset(new GRPCVariableResponse(request_handler->scope(), - request_handler->dev_ctx(), true)); - - int method_id = - static_cast(distributed::GrpcMethod::kRequestSendAndRecv); - - 
service_->RequestAsyncUnary( - method_id, &ctx_, request_.get(), &responder_, cq_, cq_, - reinterpret_cast(static_cast(req_id))); - } - - virtual ~RequestSendAndRecv() {} - std::string GetReqName() override { return request_->Varname(); } - - void Process() override { - std::string in_var_name = request_->Varname(); - std::string out_var_name = request_->OutVarname(); - std::string table_name = request_->TableName(); - int trainer_id = request_->GetTrainerId(); - - VLOG(4) << "RequestSendAndRecv, in_var_name: " << in_var_name - << " out_var_name: " << out_var_name << " trainer: " << trainer_id; - auto scope = request_->GetMutableLocalScope(); - auto invar = scope->FindVar(in_var_name); - framework::Variable* outvar = nullptr; - request_handler_->Handle(in_var_name, scope, invar, &outvar, trainer_id, - out_var_name, table_name); - SerializeToByteBuffer(out_var_name, outvar, *request_handler_->dev_ctx(), - &reply_); - Finish(reply_, &responder_); - } - - protected: - std::shared_ptr request_; - ::grpc::ByteBuffer reply_; - ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_; -}; - -void AsyncGRPCServer::WaitServerReady() { - VLOG(4) << "AsyncGRPCServer is waiting server ready"; - std::unique_lock lock(this->mutex_ready_); - condition_ready_.wait(lock, [=] { return this->ready_ == 1; }); - VLOG(4) << "AsyncGRPCServer WaitSeverReady"; -} - -// Define an option subclass in order to disable SO_REUSEPORT for the -// server socket. -// Come from: -// https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc -class NoReusePortOption : public ::grpc::ServerBuilderOption { - public: - void UpdateArguments(::grpc::ChannelArguments* args) override { - args->SetInt(GRPC_ARG_ALLOW_REUSEPORT, 0); - } - - void UpdatePlugins(std::vector>* - plugins) override {} -}; - -void AsyncGRPCServer::StartServer() { - for (int i = 0; i < FLAGS_rpc_retry_bind_port; i++) { - ::grpc::ServerBuilder builder; - std::unique_ptr service( - new GrpcService::AsyncService()); - builder.AddListeningPort(bind_address_, ::grpc::InsecureServerCredentials(), - &selected_port_); - - builder.SetMaxSendMessageSize(std::numeric_limits::max()); - builder.SetMaxReceiveMessageSize(std::numeric_limits::max()); - if (FLAGS_rpc_disable_reuse_port) { - builder.SetOption( - std::unique_ptr<::grpc::ServerBuilderOption>(new NoReusePortOption)); - LOG(INFO) << "set FLAGS_rpc_disable_reuse_port"; - } - builder.RegisterService(service.get()); - - for (auto t : rpc_call_map_) { - rpc_cq_[t.first].reset(builder.AddCompletionQueue().release()); - } - - server_ = builder.BuildAndStart(); - if (selected_port_ != 0) { - LOG(INFO) << "Server listening on " << bind_address_ - << " successful, selected port: " << selected_port_; - service_.reset(service.release()); - break; - } - - LOG(WARNING) << "Server listening on " << bind_address_ - << " failed, selected port: " << selected_port_ - << ", retry after 3 seconds!"; - - sleep(3); - } - - PADDLE_ENFORCE_NE( - selected_port_, 0, - platform::errors::Unavailable("can't bind to address:%s", bind_address_)); - - std::function f = - std::bind(&AsyncGRPCServer::TryToRegisterNewOne, this, - std::placeholders::_1, std::placeholders::_2); - - for (auto& t : rpc_call_map_) { - auto& rpc_name = t.first; - auto& cq = rpc_cq_[rpc_name]; - auto threadnum = rpc_thread_num_[rpc_name]; - auto& reqs = rpc_reqs_[rpc_name]; - - reqs.reserve(kRequestBufSize); - - for (int i = 0; i < kRequestBufSize; i++) { - VLOG(6) << "TryToRegisterNewOne on RPC NAME: " << rpc_name << " 
I: " << i; - TryToRegisterNewOne(rpc_name, i); - } - - for (int i = 0; i < threadnum; i++) { - rpc_threads_[rpc_name].emplace_back(new std::thread(std::bind( - &AsyncGRPCServer::HandleRequest, this, cq.get(), rpc_name, f))); - VLOG(4) << t.first << " creates threads!"; - } - } - - { - std::lock_guard lock(this->mutex_ready_); - ready_ = 1; - } - condition_ready_.notify_all(); - - // wait server - server_->Wait(); - - for (auto& t : rpc_threads_) { - auto& threads = t.second; - for (size_t i = 0; i < threads.size(); ++i) { - threads[i]->join(); - VLOG(4) << t.first << " threads ends!"; - } - } -} - -void AsyncGRPCServer::ShutdownQueue() { - for (auto& t : rpc_cq_) { - t.second->Shutdown(); - VLOG(4) << t.first << " queue shutdown!"; - } -} - -void AsyncGRPCServer::ShutDownImpl() { - std::unique_lock lock(cq_mutex_); - is_shut_down_ = true; - ShutdownQueue(); - - VLOG(4) << "server_ shutdown!"; - server_->Shutdown(); -} - -void AsyncGRPCServer::TryToRegisterNewOne(const std::string& rpc_name, - int req_id) { - std::unique_lock lock(cq_mutex_); - if (is_shut_down_) { - VLOG(4) << "shutdown, do not TryToRegisterNewSendOne"; - return; - } - - VLOG(4) << "TryToRegisterNewOne on RPC NAME: " << rpc_name - << " REQ ID: " << req_id; - - auto& reqs = rpc_reqs_[rpc_name]; - auto& handler = rpc_call_map_[rpc_name]; - auto& cq = rpc_cq_[rpc_name]; - - RequestBase* b = nullptr; - if (rpc_name == kRequestSend) { - b = new RequestSend(service_.get(), cq.get(), handler, req_id); - } else if (rpc_name == kRequestGet) { - b = new RequestGet(service_.get(), cq.get(), handler, req_id); - - } else if (rpc_name == kRequestGetNoBarrier) { - b = new RequestGetNoBarrier(service_.get(), cq.get(), handler, req_id); - } else if (rpc_name == kRequestGetMonomerVariable) { - b = new RequestGetMonomerVariable(service_.get(), cq.get(), handler, req_id, - this); - } else if (rpc_name == kRequestGetMonomerBarrier) { - b = new RequestGetMonomerBarrier(service_.get(), cq.get(), handler, req_id, - this); - } else if (rpc_name == kRequestPrefetch) { - b = new RequestPrefetch(service_.get(), cq.get(), handler, req_id); - } else if (rpc_name == kRequestCheckpoint) { - b = new RequestCheckpointNotify(service_.get(), cq.get(), handler, req_id); - } else if (rpc_name == kRequestNotify) { - b = new RequestNotify(service_.get(), cq.get(), handler, req_id); - } else if (rpc_name == kRequestSendAndRecv) { - b = new RequestSendAndRecv(service_.get(), cq.get(), handler, req_id); - } else { - PADDLE_THROW( - platform::errors::InvalidArgument("not supported rpc: %s", rpc_name)); - } - - reqs[req_id] = b; - - VLOG(4) << "TryToRegisterNewOne status:" << b->Status(); -} - -void AsyncGRPCServer::HandleRequest( - ::grpc::ServerCompletionQueue* cq, const std::string& rpc_name, - std::function TryToRegisterNewOne) { - void* tag = NULL; - bool ok = false; - - while (true) { - VLOG(4) << "HandleRequest " << rpc_name << " wait next"; - if (!cq->Next(&tag, &ok)) { - VLOG(4) << "CompletionQueue " << rpc_name << " shutdown!"; - break; - } - - int req_id = static_cast(reinterpret_cast(tag)); - VLOG(4) << "HandleRequest " << rpc_name << ", req_id:" << req_id - << " get next"; - - auto& reqs = rpc_reqs_[rpc_name]; - RequestBase* base = nullptr; - { - PADDLE_ENFORCE_EQ( - (req_id >= 0 && req_id < kRequestBufSize), true, - platform::errors::OutOfRange("request id: %s out of bounds: [0, %s)", - req_id, kRequestBufSize)); - std::unique_lock lock(cq_mutex_); - base = reqs[req_id]; - } - - VLOG(3) << base->Status2String(rpc_name); - - // reference: - // 
https://github.com/tensorflow/tensorflow/issues/5596 - // https://groups.google.com/forum/#!topic/grpc-io/xftlRy-IQwM - // https://groups.google.com/forum/#!topic/grpc-io/ywATt88Ef_I - if (!ok) { - VLOG(4) << "completion queue:" << rpc_name << " recv no regular event" - << " context:" << base->Status2String(rpc_name); - TryToRegisterNewOne(rpc_name, req_id); - delete base; - continue; - } - - switch (base->Status()) { - case PROCESS: { - base->Process(); - break; - } - case FINISH: { - TryToRegisterNewOne(rpc_name, req_id); - delete base; - break; - } - default: { assert(false); } - } - } -} - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc/grpc_server.h b/paddle/fluid/operators/distributed/grpc/grpc_server.h deleted file mode 100644 index 3d68b7e8cebb400680458a1163d52b01f8c8dc2e..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/grpc/grpc_server.h +++ /dev/null @@ -1,93 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include -#include -#include // NOLINT -#include -#include - -#include "grpc++/grpc++.h" -#include "paddle/fluid/framework/blocking_queue.h" -#include "paddle/fluid/framework/executor.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/distributed/distributed_pb.h" -#include "paddle/fluid/operators/distributed/grpc/grpc_service.h" -#include "paddle/fluid/operators/distributed/request_handler.h" -#include "paddle/fluid/operators/distributed/rpc_server.h" -#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" -#include "paddle/fluid/platform/profiler.h" - -namespace grpc { -class ServerCompletionQueue; -} // namespace grpc - -namespace paddle { -namespace operators { -namespace distributed { - -class RequestBase; - -class AsyncGRPCServer final : public RPCServer { - public: - explicit AsyncGRPCServer(const std::string& address, int client_num) - : RPCServer(address, client_num), ready_(0) {} - - virtual ~AsyncGRPCServer() {} - void WaitServerReady() override; - void StartServer() override; - - private: - // HandleRequest needs to be thread-safe. 
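HandleRequest, declared just below and implemented in the grpc_server.cc hunk above, is the usual async-gRPC event loop: block on cq->Next, map the returned tag back to a request slot, then either call Process() (whose Finish() call makes the queue fire once more for the same slot) or, once the request reports FINISH, delete it and register a fresh request in that slot. The following is a simplified, gRPC-free model of that PROCESS/FINISH state machine; it is only a sketch, and names such as FakeRequest are invented for illustration.

#include <iostream>
#include <memory>
#include <queue>
#include <vector>

enum CallStatus { PROCESS = 0, FINISH };

struct FakeRequest {
  int req_id = 0;
  CallStatus status = PROCESS;
  void Process() {
    std::cout << "handling request in slot " << req_id << "\n";
    status = FINISH;  // the real code flips to FINISH inside RequestBase::Finish
  }
};

int main() {
  constexpr int kRequestBufSize = 4;  // mirrors the constant in the removed header
  std::vector<std::unique_ptr<FakeRequest>> slots(kRequestBufSize);
  std::queue<int> events;  // stands in for tags delivered by cq->Next()

  for (int i = 0; i < kRequestBufSize; ++i) {
    slots[i] = std::make_unique<FakeRequest>();
    slots[i]->req_id = i;
    events.push(i);  // pretend the corresponding RPC arrives immediately
  }

  while (!events.empty()) {
    int req_id = events.front();
    events.pop();
    FakeRequest* base = slots[req_id].get();
    switch (base->status) {
      case PROCESS:
        base->Process();
        events.push(req_id);  // Finish() makes the completion queue fire again
        break;
      case FINISH:
        // The removed code deletes the request and calls TryToRegisterNewOne
        // for the same slot; here we simply retire it so the loop terminates.
        slots[req_id].reset();
        break;
    }
  }
  return 0;
}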
- void HandleRequest( - ::grpc::ServerCompletionQueue* cq, const std::string& rpc_name, - std::function TryToRegisterNewOne); - - void TryToRegisterNewOne(const std::string& rpc_name, int req_id); - void ShutdownQueue(); - void ShutDownImpl() override; - - private: - static const int kRequestBufSize = 100; - - std::mutex cq_mutex_; - volatile bool is_shut_down_ = false; - - std::unique_ptr service_; - std::unique_ptr<::grpc::Server> server_; - - // condition of the sub program - std::condition_variable barrier_condition_; - - std::mutex mutex_ready_; - std::condition_variable condition_ready_; - - int ready_; - - std::map> rpc_cq_; - std::map>> rpc_threads_; - std::map> rpc_reqs_; -}; - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc/grpc_service.h b/paddle/fluid/operators/distributed/grpc/grpc_service.h deleted file mode 100644 index 10037c90853debb37a08921db2bfc5968dc7094e..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/grpc/grpc_service.h +++ /dev/null @@ -1,145 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "paddle/fluid/operators/distributed/grpc/grpc_variable_response.h" -#include "paddle/fluid/platform/profiler.h" - -// NOTE: This method was originally created by tensorflow -// (https://github.com/tensorflow/tensorflow/) we borrow this -// method and did some modifications so that we can parse gRPC -// requests without too much copying of the tensor data. - -namespace grpc { -class CompletionQueue; -class Channel; -class RpcService; -class ServerCompletionQueue; -class ServerContext; - -// Support parsing/unparsing of tensorflow::VariableResponse. -// Wire-format is identical to RecvVariableResponse. 
-template <> -class SerializationTraits< - paddle::operators::distributed::GRPCVariableResponse> { - public: - static Status Serialize( - const paddle::operators::distributed::GRPCVariableResponse& msg, - grpc_byte_buffer** bp, bool* own_buffer) { - PADDLE_THROW(paddle::platform::errors::Unimplemented( - "SerializationTraits::Serialize not implemented!")); - return Status(); - } - static Status Deserialize( - grpc_byte_buffer* buffer, - paddle::operators::distributed::GRPCVariableResponse* msg, - int max_message_size = INT_MAX) { - if (buffer == nullptr) { - return Status(StatusCode::INTERNAL, "No payload"); - } - - Status result = g_core_codegen_interface->ok(); - if (result.ok()) { - paddle::operators::distributed::GrpcByteSource source(buffer); - int ret = msg->Parse(&source); - if (ret != 0) { - result = Status(StatusCode::INTERNAL, "VariableResponse parse error"); - } - } - g_core_codegen_interface->grpc_byte_buffer_destroy(buffer); - return result; - } -}; -} // namespace grpc - -namespace paddle { -namespace operators { -namespace distributed { - -enum class GrpcMethod { - kSendVariable, - kGetVariable, - kPrefetchVariable, - kCheckpointNotify, - kGetVariableNoBarrier, - kGetMonomerVariable, - kGetMonomerBarrier, - kRequestNotify, - kRequestSendAndRecv, - // when you add new handler, change kGrpcNumMethods at the same time! -}; - -static const int kGrpcNumMethods = - static_cast(GrpcMethod::kRequestSendAndRecv) + 1; - -inline const char* GrpcMethodName(GrpcMethod id) { - switch (id) { - case GrpcMethod::kSendVariable: - return "/sendrecv.SendRecvService/SendVariable"; - case GrpcMethod::kGetVariable: - return "/sendrecv.SendRecvService/GetVariable"; - case GrpcMethod::kGetVariableNoBarrier: - return "/sendrecv.SendRecvService/GetVariableNoBarrier"; - case GrpcMethod::kGetMonomerVariable: - return "/sendrecv.SendRecvService/GetMonomerVariable"; - case GrpcMethod::kGetMonomerBarrier: - return "/sendrecv.SendRecvService/GetMonomerBarrier"; - case GrpcMethod::kPrefetchVariable: - return "/sendrecv.SendRecvService/PrefetchVariable"; - case GrpcMethod::kCheckpointNotify: - return "/sendrecv.SendRecvService/CheckpointNotify"; - case GrpcMethod::kRequestNotify: - return "/sendrecv.SendRecvService/DistributeNotify"; - case GrpcMethod::kRequestSendAndRecv: - return "/sendrecv.SendRecvService/SendAndRecvVariable"; - } - - // Shouldn't be reached. - PADDLE_THROW(platform::errors::InvalidArgument( - "Invalid id: not found valid method name")); - return nullptr; -} - -class GrpcService final { - public: - class AsyncService : public ::grpc::Service { - public: - AsyncService() { - for (int i = 0; i < kGrpcNumMethods; ++i) { - AddMethod(new ::grpc::internal::RpcServiceMethod( - GrpcMethodName(static_cast(i)), - ::grpc::internal::RpcMethod::NORMAL_RPC, nullptr)); - ::grpc::Service::MarkMethodAsync(i); - } - } - virtual ~AsyncService() {} - - // Make RequestAsyncUnary public for grpc_call.h - using ::grpc::Service::RequestAsyncUnary; - }; -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc/grpc_variable_response.cc b/paddle/fluid/operators/distributed/grpc/grpc_variable_response.cc deleted file mode 100644 index f7679e9fc924dfe810bf6375787e2ebc4f75dd3e..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/grpc/grpc_variable_response.cc +++ /dev/null @@ -1,344 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include -#include - -#include "google/protobuf/io/coded_stream.h" -#include "paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h" -#include "paddle/fluid/operators/distributed/grpc/grpc_variable_response.h" -#include "paddle/fluid/operators/distributed/send_recv.pb.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/profiler.h" - -namespace google { -namespace protobuf { -namespace io { -class ZeroCopyInputStream; -} // namespace io -} // namespace protobuf -} // namespace google -namespace grpc { -class ByteBuffer; -} // namespace grpc - -namespace paddle { -namespace operators { -namespace distributed { - -enum WireType { - WIRETYPE_VARINT = 0, - WIRETYPE_LENGTH_DELIMITED = 2, -}; - -inline int GetTagFieldNumber(uint32_t tag) { return tag >> 3; } - -inline WireType GetTagWireType(uint32_t tag) { - return static_cast(tag & 0x7); -} - -bool ReadVarintSizeAsInt(::google::protobuf::io::CodedInputStream* input, - int* result) { - uint64_t v; - if (input->ReadVarint64(&v) && v <= static_cast(INT_MAX)) { - *result = static_cast(v); - return true; - } else { - return false; - } -} - -int GRPCVariableResponse::Parse(const ::grpc::ByteBuffer& byte_buffer) { - GrpcByteBufferSource source; - source.Init(byte_buffer); - GrpcByteBufferSourceWrapper r(&source); - - return Parse(&r); -} - -bool ParseLodData(::google::protobuf::io::CodedInputStream* input, - std::vector* lod) { - while (true) { - auto p = input->ReadTagWithCutoff(127); - int tag = GetTagFieldNumber(p.first); - WireType wt = GetTagWireType(p.first); - - if (!p.second) { - return (tag == 0); - } - - switch (tag) { - case sendrecv::VariableMessage_LodData::kLodDataFieldNumber: { - uint64_t v; - if (wt == WIRETYPE_VARINT) { - if (!input->ReadVarint64(&v)) { - return false; - } - lod->push_back(v); - break; - } - - if (wt == WIRETYPE_LENGTH_DELIMITED) { - int num_bytes = 0; - if (!input->ReadVarintSizeAsInt(&num_bytes)) { - return tag; - } - int start_pos = input->CurrentPosition(); - while (input->CurrentPosition() - start_pos < num_bytes) { - uint64_t v; - if (!input->ReadVarint64(&v)) { - return tag; - } - lod->push_back(v); - } - break; - } - - return false; - } - default: { return false; } - } - } - - return true; -} - -int GRPCVariableResponse::Parse(Source* source) { - ::google::protobuf::io::ZeroCopyInputStream* input_stream = - source->contents(); - ::google::protobuf::io::CodedInputStream input(input_stream); - input.SetTotalBytesLimit(INT_MAX, INT_MAX); - - while (true) { - auto p = input.ReadTagWithCutoff(127); - int tag = GetTagFieldNumber(p.first); - WireType wt = GetTagWireType(p.first); - if (!p.second) { - if (tag != 0) { - return -1; - } - return 0; - } - - switch (tag) { - case sendrecv::VariableMessage::kVarnameFieldNumber: { - uint32_t length; - if ((wt != WIRETYPE_LENGTH_DELIMITED) || !input.ReadVarint32(&length)) { - return tag; - } - - std::string temp; - if (!input.ReadString(&temp, length)) { - 
return tag; - } - - meta_.set_varname(temp); - break; - } - case sendrecv::VariableMessage::kTypeFieldNumber: { - uint32_t v; - if ((wt != WIRETYPE_VARINT) || !input.ReadVarint32(&v)) { - return tag; - } - - meta_.set_type(static_cast<::sendrecv::VarType>(v)); - break; - } - case sendrecv::VariableMessage::kDataTypeFieldNumber: { - uint32_t v = 0; - if ((wt != WIRETYPE_VARINT) || !input.ReadVarint32(&v)) { - return tag; - } - - meta_.set_data_type(static_cast<::sendrecv::VariableMessage_Type>(v)); - break; - } - case sendrecv::VariableMessage::kDimsFieldNumber: { - // not packed - if (wt == WIRETYPE_VARINT) { - uint64_t v; - if (!input.ReadVarint64(&v)) { - return tag; - } - meta_.add_dims(v); - break; - } - - // packed - if (wt == WIRETYPE_LENGTH_DELIMITED) { - int num_bytes = 0; - if (!input.ReadVarintSizeAsInt(&num_bytes)) { - return tag; - } - int start_pos = input.CurrentPosition(); - while (input.CurrentPosition() - start_pos < num_bytes) { - uint64_t v; - if (!input.ReadVarint64(&v)) { - return tag; - } - meta_.add_dims(v); - } - break; - } - return tag; - } - case sendrecv::VariableMessage::kLodLevelFieldNumber: { - uint64_t v = 0; - if ((wt != WIRETYPE_VARINT) || !input.ReadVarint64(&v)) { - return tag; - } - meta_.set_lod_level(static_cast(v)); - break; - } - case sendrecv::VariableMessage::kLodFieldNumber: { - int length = 0; - if (wt != WIRETYPE_LENGTH_DELIMITED || - !ReadVarintSizeAsInt(&input, &length)) { - return tag; - } - - std::pair<::google::protobuf::io::CodedInputStream::Limit, int> p = - input.IncrementRecursionDepthAndPushLimit(length); - - std::vector lod_data; - if (p.second < 0 || !ParseLodData(&input, &lod_data)) { - return tag; - } - - if (!input.DecrementRecursionDepthAndPopLimit(p.first)) { - return tag; - } - - if (lod_data.size() == 0) { - break; - } - - auto lod = meta_.add_lod(); - for (uint32_t i = 0; i < lod_data.size(); i++) { - lod->add_lod_data(lod_data[i]); - } - break; - } - case sendrecv::VariableMessage::kSlrHeightFieldNumber: { - uint64_t v = 0; - if ((wt != WIRETYPE_VARINT) || !input.ReadVarint64(&v)) { - return tag; - } - meta_.set_slr_height(static_cast(v)); - break; - } - case sendrecv::VariableMessage::kSerializedFieldNumber: { - int num_bytes = 0; - if (wt != WIRETYPE_LENGTH_DELIMITED || - !ReadVarintSizeAsInt(&input, &num_bytes)) { - return tag; - } - - if (!ProcSerializedField(tag, &input, num_bytes)) { - return tag; - } - - break; - } - case sendrecv::VariableMessage::kRowsFieldNumber: { - PADDLE_ENFORCE((meta_.type() == sendrecv::SELECTED_ROWS || - meta_.type() == sendrecv::LOD_TENSOR) && - meta_.varname() != "", - platform::errors::PreconditionNotMet( - "meta info should be got first!")); - - int num_bytes = 0; - if (wt != WIRETYPE_LENGTH_DELIMITED || - !ReadVarintSizeAsInt(&input, &num_bytes)) { - return tag; - } - - if (!CopySelectRowsData(&input, *dev_ctx_, num_bytes)) { - return tag; - } - break; - } - case sendrecv::VariableMessage::kOutVarnameFieldNumber: { - uint32_t length; - if ((wt != WIRETYPE_LENGTH_DELIMITED) || !input.ReadVarint32(&length)) { - return tag; - } - - std::string temp; - if (!input.ReadString(&temp, length)) { - return tag; - } - - meta_.set_out_varname(temp); - break; - } - case sendrecv::VariableMessage::kProfileFieldNumber: { - uint64_t profiling = 0; - if (!input.ReadVarint64(&profiling)) { - return tag; - } - meta_.set_profile(profiling); - int64_t listener_id = platform::ListenerId(); - if (listener_id <= 0) { - break; - } - if (profiling == platform::kEnableProfiler && - !platform::IsProfileEnabled()) { 
- platform::EnableProfiler(platform::ProfilerState::kCPU); - } else if (profiling == platform::kDisableProfiler && - platform::IsProfileEnabled()) { - platform::DisableProfiler( - platform::EventSortingKey::kDefault, - string::Sprintf("%s_%lld", FLAGS_rpc_server_profile_path, - listener_id)); - } - break; - } - case sendrecv::VariableMessage::kTrainerIdFieldNumber: { - uint64_t trainer_id = 0; - if (!input.ReadVarint64(&trainer_id)) { - return tag; - } - meta_.set_trainer_id(trainer_id); - break; - } - case sendrecv::VariableMessage::kTableNameFieldNumber: { - uint32_t length; - if ((wt != WIRETYPE_LENGTH_DELIMITED) || !input.ReadVarint32(&length)) { - return tag; - } - - std::string temp; - if (!input.ReadString(&temp, length)) { - return tag; - } - - meta_.set_table_name(temp); - break; - } - default: { - // Unknown tag, return unknown error. - return -1; - } - } - } - - return 0; -} - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc/grpc_variable_response.h b/paddle/fluid/operators/distributed/grpc/grpc_variable_response.h deleted file mode 100644 index 4d12b4a4bacd7ffee6ac7725951b967f7eb2da15..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/grpc/grpc_variable_response.h +++ /dev/null @@ -1,67 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include - -#include "google/protobuf/io/coded_stream.h" -#include "google/protobuf/io/zero_copy_stream.h" -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/distributed/distributed_pb.h" -#include "paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h" -#include "paddle/fluid/operators/distributed/variable_response.h" - -namespace grpc { -class ByteBuffer; -} // namespace grpc -namespace paddle { -namespace framework { -class Scope; -} // namespace framework -namespace platform { -class DeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -namespace distributed { - -class GRPCVariableResponse : public VariableResponse { - public: - GRPCVariableResponse(const framework::Scope* scope, - const platform::DeviceContext* dev_ctx, - bool create_scope = false) - : VariableResponse(scope, dev_ctx, create_scope) {} - - virtual ~GRPCVariableResponse() {} - - int Parse(Source* source) override; - - // return: - // 0:ok. - // -1: unkown error. - // other: number of error field. 
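The GRPCVariableResponse parser removed above walks the raw sendrecv.VariableMessage bytes itself: every protobuf tag packs the field number in the high bits (tag >> 3) and the wire type in the low three bits (tag & 0x7), with varint and length-delimited fields then handled case by case. The short round-trip below illustrates that encoding with protobuf's CodedOutputStream/CodedInputStream; the field numbers are hypothetical stand-ins, not the ones defined in send_recv.proto.

#include <cstdint>
#include <string>
#include "google/protobuf/io/coded_stream.h"
#include "google/protobuf/io/zero_copy_stream_impl_lite.h"

int main() {
  // Hypothetical field numbers; the real ones are declared in send_recv.proto.
  const int kVarnameField = 1;   // wire type 2: length-delimited
  const int kLodLevelField = 5;  // wire type 0: varint

  std::string buf;
  {
    google::protobuf::io::StringOutputStream raw(&buf);
    google::protobuf::io::CodedOutputStream out(&raw);
    out.WriteTag((kVarnameField << 3) | 2);  // tag = (field_number << 3) | wire_type
    const std::string name = "nce_w@GRAD.block0";
    out.WriteVarint32(static_cast<uint32_t>(name.size()));
    out.WriteString(name);
    out.WriteTag((kLodLevelField << 3) | 0);
    out.WriteVarint64(1);
  }  // CodedOutputStream flushes into buf on destruction

  google::protobuf::io::CodedInputStream in(
      reinterpret_cast<const uint8_t*>(buf.data()), static_cast<int>(buf.size()));
  uint32_t tag;
  while ((tag = in.ReadTag()) != 0) {
    int field = tag >> 3;       // GetTagFieldNumber in the removed code
    int wire_type = tag & 0x7;  // GetTagWireType in the removed code
    if (wire_type == 2) {       // length-delimited: size varint, then payload
      uint32_t length = 0;
      in.ReadVarint32(&length);
      std::string value;
      in.ReadString(&value, length);
    } else if (wire_type == 0) {  // plain varint
      uint64_t v = 0;
      in.ReadVarint64(&v);
    }
  }
  return 0;
}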
- int Parse(const ::grpc::ByteBuffer& byte_buffer); -}; - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/heart_beat_monitor.cc b/paddle/fluid/operators/distributed/heart_beat_monitor.cc deleted file mode 100644 index 9f537f533489860447532e80a08f48ac2750e48c..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/heart_beat_monitor.cc +++ /dev/null @@ -1,97 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/distributed/heart_beat_monitor.h" - -#include - -namespace paddle { -namespace operators { -namespace distributed { - -DEFINE_int32(worker_update_interval_secs, 900, - " the longest time interval between the worker update variables"); - -inline int GetCurrentUS() { - // current date/time based on current system - time_t t = std::time(0); - int now = static_cast(t); - return now; -} - -void HeartBeatMonitor::Update(const int worker_id, std::string be_monitored_var, - WorkerStatus status) { - if (status == UNINITED) { - LOG(WARNING) << "HeartBeatMonitor receive UNINITED status can not be used " - "in Update, something error"; - } - - if (!is_chief_) { - return; - } - - if ((be_monitored_var == be_monitored_var_ && status == RUNNING) || - status == COMPLETED) { - auto timestamp = GetCurrentUS(); - UnderMonitoredWorker& worker = worker_status_map_.at(worker_id); - - if (worker.status != COMPLETED) { - worker.status = status; - } - worker.timestamp = timestamp; - return; - } -} - -void HeartBeatMonitor::LostWorkerMonitor() { - VLOG(1) << "worker heartbeat monitor start at No.0 parameter server"; - while (running_) { - for (int id = 0; id < workers_; ++id) { - auto& worker = worker_status_map_.at(id); - - if (worker.status == UNINITED) { - VLOG(4) << "worker " << worker.id << " is under UNINITED"; - continue; - } - if (worker.status == COMPLETED) { - VLOG(4) << "worker " << worker.id << " is under COMPLETED"; - continue; - } - - auto timestamp = GetCurrentUS(); - - VLOG(4) << "worker " << worker.id << " status is " << worker.status - << " timestamp is " << worker.timestamp << " the interval is " - << timestamp - worker.timestamp; - - if (timestamp - worker.timestamp >= FLAGS_worker_update_interval_secs) { - PADDLE_THROW(platform::errors::ExecutionTimeout( - "the latest update of worker %d is %d secs ago, we doubt the " - "the worker is not alive and this may have a bad effect on the " - "fitting result, please check", - worker.id, FLAGS_worker_update_interval_secs)); - } - } - - std::this_thread::sleep_for(std::chrono::milliseconds(10 * 1000)); - } - VLOG(1) << "worker heartbeat monitor stopped, thread exit"; -} - -std::once_flag HeartBeatMonitor::init_flag_; -std::unique_ptr HeartBeatMonitor::monitor_(nullptr); - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/heart_beat_monitor.h 
b/paddle/fluid/operators/distributed/heart_beat_monitor.h deleted file mode 100644 index d96433c318b3578446ffe484d40711d2559f91ab..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/heart_beat_monitor.h +++ /dev/null @@ -1,127 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include // NOLINT -#include -#include -#include // NOLINT -#include -#include -#include -#include -#include "gflags/gflags.h" - -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace operators { -namespace distributed { - -enum WorkerStatus { UNINITED = 0, RUNNING, COMPLETED }; - -struct UnderMonitoredWorker { - int id; - WorkerStatus status; - int timestamp; - - UnderMonitoredWorker() {} - - explicit UnderMonitoredWorker(int worker_id) { - this->id = worker_id; - this->status = UNINITED; - this->timestamp = 0; - } -}; - -class HeartBeatMonitor { - public: - explicit HeartBeatMonitor(int workers, bool is_chief, - std::string be_monitored_var) - : workers_(workers), - is_chief_(is_chief), - be_monitored_var_(be_monitored_var), - running_(true) { - PADDLE_ENFORCE_GT(workers, 0, platform::errors::InvalidArgument( - "workers must greater than 0.")); - - for (auto worker_id = 0; worker_id < workers; worker_id++) { - UnderMonitoredWorker worker(worker_id); - worker_status_map_[worker_id] = std::move(worker); - } - - // we define the No.0 pserver is the first parameter server - // only No.0 will check the heartbeat of all trainers - if (is_chief) { - monitor_thread_.reset(new std::thread( - std::bind(&HeartBeatMonitor::LostWorkerMonitor, this))); - } - } - - ~HeartBeatMonitor() { - running_ = false; - if (monitor_thread_) monitor_thread_->join(); - } - - static void Init(int workers, bool is_chief, std::string be_monitored_var) { - std::call_once(init_flag_, &HeartBeatMonitor::InitImpl, workers, is_chief, - be_monitored_var); - } - - static HeartBeatMonitor* GetInstance() { return monitor_.get(); } - - void Stop() { - running_ = false; - if (!monitor_) { - VLOG(0) << "HeartBeatMonitor is not inited, do nothing"; - } else { - if (monitor_thread_) { - monitor_thread_->join(); - monitor_thread_.reset(nullptr); - } - } - } - - void Update(const int worker_id, std::string be_monitored_var, - WorkerStatus status); - - void LostWorkerMonitor(); - - private: - // Init is called by GetInstance. 
- static void InitImpl(int workers, bool is_chief, - std::string be_monitored_var) { - if (monitor_ == nullptr) { - monitor_.reset(new HeartBeatMonitor(workers, is_chief, be_monitored_var)); - } - } - - static std::once_flag init_flag_; - static std::unique_ptr monitor_; - - int workers_; - bool is_chief_; - std::string be_monitored_var_; - std::unordered_map worker_status_map_; - std::unique_ptr monitor_thread_{nullptr}; - std::mutex mutex_; - bool running_ = false; -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/heart_beat_monitor_test.cc b/paddle/fluid/operators/distributed/heart_beat_monitor_test.cc deleted file mode 100644 index 8505023f63a95d604749be1de787c7688ce27848..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/heart_beat_monitor_test.cc +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/distributed/heart_beat_monitor.h" - -#include "gtest/gtest.h" - -namespace paddle { -namespace operators { -namespace distributed { - -void run(HeartBeatMonitor* monitor) { monitor->LostWorkerMonitor(); } - -TEST(HeartBeatMonitor, All) { - int trainers = 10; - int pserver_id = 0; - std::string var = "nce_w@GRAD.block0"; - std::string var2 = "nce_w@GRAD.block2"; - - HeartBeatMonitor::Init(trainers, pserver_id == 0, var); - - auto* monitor = HeartBeatMonitor::GetInstance(); - - std::vector ids{1, 3, 5, 7}; - - for (auto& id : ids) { - monitor->Update(id, var, RUNNING); - } - - monitor->Update(9, var2, RUNNING); - monitor->Update(2, var, COMPLETED); - - std::thread t(run, monitor); - t.detach(); - - std::this_thread::sleep_for(std::chrono::milliseconds(15 * 1000)); - - monitor->Stop(); -} - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/large_scale_kv.h b/paddle/fluid/operators/distributed/large_scale_kv.h deleted file mode 100644 index da2281231fc8a339fbc2b4cc0fed841bd24e9645..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/large_scale_kv.h +++ /dev/null @@ -1,848 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
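The HeartBeatMonitor removed above is essentially a watchdog on the chief parameter server: every trainer update stamps a per-worker timestamp, and a background thread periodically compares those stamps against FLAGS_worker_update_interval_secs and raises an error for any worker that has gone quiet. A stripped-down sketch of that idea, using std::chrono instead of the integer-second timestamps of the original and with invented names such as TinyMonitor, might look like this:

#include <chrono>
#include <iostream>
#include <mutex>
#include <thread>
#include <unordered_map>
#include <vector>

// Minimal watchdog sketch: record a heartbeat per worker, report the
// workers whose most recent heartbeat is older than the timeout.
class TinyMonitor {
 public:
  explicit TinyMonitor(int timeout_secs) : timeout_secs_(timeout_secs) {}

  void Update(int worker_id) {
    std::lock_guard<std::mutex> lock(mu_);
    last_seen_[worker_id] = std::chrono::steady_clock::now();
  }

  std::vector<int> Stale() {
    std::lock_guard<std::mutex> lock(mu_);
    std::vector<int> stale;
    auto now = std::chrono::steady_clock::now();
    for (const auto& kv : last_seen_) {
      auto age = std::chrono::duration_cast<std::chrono::seconds>(now - kv.second);
      if (age.count() >= timeout_secs_) stale.push_back(kv.first);
    }
    return stale;
  }

 private:
  int timeout_secs_;
  std::mutex mu_;
  std::unordered_map<int, std::chrono::steady_clock::time_point> last_seen_;
};

int main() {
  TinyMonitor monitor(/*timeout_secs=*/1);
  monitor.Update(0);
  monitor.Update(1);
  std::this_thread::sleep_for(std::chrono::seconds(2));
  monitor.Update(1);  // worker 1 heartbeats again, worker 0 goes stale
  for (int id : monitor.Stale()) std::cout << "stale worker: " << id << "\n";
  return 0;
}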
- -#pragma once - -#include -#include -#include // NOLINT -#include -#include -#include // NOLINT -#include -#include -#include -#include -#include "gflags/gflags.h" - -#include "paddle/fluid/framework/generator.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/rw_lock.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/framework/threadpool.h" -#include "paddle/fluid/framework/variable.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/place.h" -#include "paddle/fluid/platform/port.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/fluid/string/string_helper.h" - -namespace paddle { -namespace operators { -namespace distributed { - -enum Mode { training, infer }; -enum InitType { uniform_random, fill_constant, gaussian_random }; - -inline std::vector bucket(const int v_size, const int b_size) { - int remainder = v_size % b_size; - int bucket = v_size / b_size; - std::vector ret_vec(b_size, bucket); - for (int i = 0; i < remainder; ++i) { - ret_vec[i] = ret_vec[i] + 1; - } - int cur_bucket = 0; - for (int &j : ret_vec) { - int tmp = j; - j = cur_bucket; - cur_bucket += tmp; - } - ret_vec.push_back(cur_bucket); - return ret_vec; -} - -class Initializer { - public: - Initializer() {} - - explicit Initializer(const std::vector &attrs) {} - - virtual float GetValue() = 0; - - virtual ~Initializer() {} - - protected: - std::string name_; - unsigned int seed_; -}; - -class UniformInitializer : public Initializer { - public: - explicit UniformInitializer(const std::vector &attrs) { - name_ = attrs[0]; - seed_ = static_cast(std::stoi(attrs[1])); - min_ = std::stof(attrs[2]); - max_ = std::stof(attrs[3]); - - dist_ = std::uniform_real_distribution(min_, max_); - random_engine_ = framework::GetCPURandomEngine(seed_); - } - - float GetValue() override { return dist_(*random_engine_); } - - private: - float min_; - float max_; - - std::shared_ptr random_engine_; - std::uniform_real_distribution dist_; -}; - -template -inline bool entry(const int count, const T threshold); - -template <> -inline bool entry(const int count, const std::string threshold) { - return true; -} - -template <> -inline bool entry(const int count, const int threshold) { - return count >= threshold; -} - -template <> -inline bool entry(const int count, const float threshold) { - UniformInitializer uniform = UniformInitializer({"0", "0", "1"}); - return uniform.GetValue() >= threshold; -} - -class GaussianInitializer : public Initializer { - public: - explicit GaussianInitializer(const std::vector &attrs) { - name_ = attrs[0]; - seed_ = static_cast(std::stoi(attrs[1])); - mean_ = std::stof(attrs[2]); - std_ = std::stof(attrs[3]); - - random_engine_ = framework::GetCPURandomEngine(seed_); - - dist_ = std::normal_distribution(mean_, std_); - } - - float GetValue() override { return dist_(*random_engine_); } - - private: - float std_; - float mean_; - - std::shared_ptr random_engine_; - std::normal_distribution dist_; -}; - -class FillConstantInitializer : public Initializer { - public: - explicit FillConstantInitializer(const std::vector &attrs) { - name_ = attrs[0]; - value_ = std::stof(attrs[1]); - } - - float GetValue() override { return value_; } - - private: - float value_; -}; - -struct SparseMeta { - std::string name; - std::string grad_name; - std::vector value_names; - std::vector value_dims; - std::vector cached_varnames; - 
std::vector initializer_attrs; - std::string entry; - Mode mode; - - std::string ToString() { - std::stringstream ss; - ss << "name: " << name << " "; - ss << "mode: " << mode << " "; - - for (int i = 0; i < static_cast(value_names.size()); i++) { - ss << "value_name: " << value_names[i] << " dim: " << value_dims[i] - << " "; - } - - ss << " grad var: " << grad_name; - - ss << " cached varnames: "; - for (int i = 0; i < static_cast(cached_varnames.size()); i++) { - ss << cached_varnames[i] << " "; - } - - ss << " initializer attrs: "; - for (int i = 0; i < static_cast(initializer_attrs.size()); i++) { - ss << initializer_attrs[i] << " "; - } - - ss << " entry attrs: " << entry; - - return ss.str(); - } -}; - -struct VALUE { - explicit VALUE(const std::vector &names) - : names_(names), count_(0), unseen_days_(0) { - values_.resize(names.size()); - for (int i = 0; i < static_cast(names.size()); i++) { - places[names[i]] = i; - } - } - - void set(std::vector> *values) { - values_ = std::move(*values); - } - - void set(const std::vector &names, - const std::vector> &values) { - for (int i = 0; i < static_cast(names.size()); i++) { - auto idx = places[names[i]]; - auto value = values[i]; - values_[idx].assign(value.begin(), value.end()); - } - } - - std::vector *> get() { - auto pts = std::vector *>(); - pts.reserve(values_.size()); - - for (auto &value : values_) { - pts.push_back(&value); - } - return pts; - } - - int fetch_count() { return ++count_; } - void reset_unseen_days() { unseen_days_ = 0; } - - void set_entry(bool is_entry) { is_entry_ = is_entry; } - - bool get_entry() { return is_entry_; } - - std::vector *> get(const std::vector names) { - auto pts = std::vector *>(); - pts.reserve(values_.size()); - - for (int i = 0; i < static_cast(names.size()); i++) { - pts.push_back(&(values_[places[names[i]]])); - } - return pts; - } - - std::vector names_; - int count_; - bool seen_after_last_save_; - int unseen_days_; - bool is_entry_; - std::vector> values_; - std::unordered_map places; -}; - -class ValueBlock { - public: - explicit ValueBlock(const std::vector value_names, - const std::vector value_dims, const Mode &mode, - const std::vector &init_attrs, - const std::string &entry_attr) - : value_names_(value_names), value_dims_(value_dims), mode_(mode) { - // for Initializer - for (size_t i = 0; i < value_names.size(); i++) { - auto name = value_names[i]; - auto slices = string::split_string(init_attrs[i], "&"); - - if (slices[0] == "gaussian_random") { - initializers_[name] = new GaussianInitializer(slices); - } else if (slices[0] == "fill_constant") { - initializers_[name] = new FillConstantInitializer(slices); - } else if (slices[0] == "uniform_random") { - initializers_[name] = new UniformInitializer(slices); - } else { - PADDLE_THROW( - platform::errors::InvalidArgument("%s can not be supported", name)); - } - } - - // for Entry - { - if (entry_attr == "none") { - entry_func_ = - std::bind(entry, std::placeholders::_1, "none"); - } else { - auto slices = string::split_string(entry_attr, "&"); - if (slices[0] == "count_filter") { - int threshold = std::stoi(slices[1]); - entry_func_ = std::bind(entry, std::placeholders::_1, threshold); - } else if (slices[0] == "probability") { - float threshold = std::stof(slices[1]); - entry_func_ = - std::bind(entry, std::placeholders::_1, threshold); - } - } - } - - rwlock_.reset(new framework::RWLock); - } - - ~ValueBlock() { - // for (auto init : initializers_) { - // delete init.second; - // initializers_.erase(init.first); - // } - // - // 
for (auto value : values_) { - // delete value.second; - // values_.erase(value.first); - // } - } - - void Init(const int64_t &id, std::vector> *values, - int count) { - if (Has(id)) { - PADDLE_THROW(platform::errors::AlreadyExists("id already exist, error")); - } - - if (values->size() != value_names_.size()) { - PADDLE_THROW( - platform::errors::AlreadyExists("values can not match, error")); - } - - auto value = new VALUE(value_names_); - value->set(values); - value->seen_after_last_save_ = true; - value->count_ = count; - values_[id] = value; - } - - std::vector *> Get( - const int64_t &id, const std::vector &value_names) { - rwlock_->RDLock(); - auto ret_values = values_.at(id)->get(value_names); - rwlock_->UNLock(); - return ret_values; - } - - void InitFromInitializer(const int64_t &id, - const std::vector &value_names) { - rwlock_->WRLock(); - - if (Has(id)) { - Update(id); - rwlock_->UNLock(); - return; - } - - auto rets = std::vector>(); - rets.resize(value_names_.size()); - - for (int i = 0; i < static_cast(value_names_.size()); i++) { - auto name = value_names_[i]; - auto *init = initializers_.at(name); - - auto dim = value_dims_[i]; - rets[i].resize(dim); - - for (int j = 0; j < static_cast(dim); j++) { - rets[i][j] = init->GetValue(); - } - } - - Init(id, &rets, 0); - Update(id); - rwlock_->UNLock(); - } - - bool GetEntry(const int64_t &id) { - rwlock_->RDLock(); - auto value = values_.at(id); - auto entry = value->get_entry(); - rwlock_->UNLock(); - return entry; - } - - void Set(const int64_t &id, const std::vector &value_names, - const std::vector> &values) { - rwlock_->WRLock(); - auto value = values_.at(id); - value->set(value_names, values); - rwlock_->UNLock(); - } - - void Update(const int64_t id) { - auto *value = values_.at(id); - value->reset_unseen_days(); - auto count = value->fetch_count(); - - if (!value->get_entry()) { - value->set_entry(entry_func_(count)); - } - } - - private: - bool Has(const int64_t id) { - auto got = values_.find(id); - if (got == values_.end()) { - return false; - } else { - return true; - } - } - - public: - std::unordered_map values_; - - private: - std::vector value_names_; - std::vector value_dims_; - Mode mode_; - std::function entry_func_; - std::unordered_map initializers_; - std::unique_ptr rwlock_{nullptr}; -}; - -class SparseVariable { - public: - explicit SparseVariable(const SparseMeta &meta) { - meta_.name = meta.name; - meta_.mode = meta.mode; - meta_.value_names = meta.value_names; - meta_.value_dims = meta.value_dims; - meta_.grad_name = meta.grad_name; - meta_.cached_varnames = meta.cached_varnames; - meta_.initializer_attrs = meta.initializer_attrs; - meta_.entry = meta.entry; - - for (int i = 0; i < static_cast(meta_.value_names.size()); i++) { - values_dims_[meta_.value_names[i]] = meta_.value_dims[i]; - } - - for (size_t i = 0; i < shard_num_; i++) { - auto block = std::make_shared( - meta.value_names, meta.value_dims, meta.mode, meta.initializer_attrs, - meta.entry); - shard_blocks_.emplace_back(block); - } - - rwlock_.reset(new framework::RWLock); - } - - void Init(const std::vector &ids) { - rwlock_->RDLock(); - for (auto &id : ids) { - auto *block = GetShard(id); - block->InitFromInitializer(id, meta_.value_names); - } - rwlock_->UNLock(); - } - - void Get(const std::vector &ids, - const std::vector &value_names, - std::vector *>> *values) { - values->resize(ids.size()); - - auto buckets = bucket(ids.size(), 8); - std::vector> fs; - - for (int j = 0; j < 8; ++j) { - auto begin = buckets[j]; - auto end = buckets[j 
+ 1]; - - fs.push_back( - framework::Async([begin, end, &values, &ids, &value_names, this]() { - for (int x = begin; x < end; x++) { - auto id = ids[x]; - auto *block = GetShard(id); - auto id_values = block->Get(id, value_names); - (*values)[x] = id_values; - } - })); - } - - for (size_t i = 0; i < fs.size(); ++i) fs[i].wait(); - } - - void GetEntry(const std::vector &ids, std::vector *values) { - auto buckets = bucket(ids.size(), 8); - std::vector> fs; - - for (int j = 0; j < 8; ++j) { - auto begin = buckets[j]; - auto end = buckets[j + 1]; - - fs.push_back(framework::Async([begin, end, &values, &ids, this]() { - for (int x = begin; x < end; x++) { - auto id = ids[x]; - auto *block = GetShard(id); - auto is_entry = block->GetEntry(id); - - if (!is_entry) { - values->push_back(id); - } - } - })); - } - for (size_t i = 0; i < fs.size(); ++i) fs[i].wait(); - } - - void Set(const std::vector &ids, - const std::vector &value_names, - const std::vector>> &values) { - for (int i = 0; i < static_cast(ids.size()); i++) { - GetShard(ids[i])->Set(ids[i], value_names, values[i]); - } - } - - void Dims(std::vector value_names, std::vector *dims) { - for (auto &name : value_names) { - dims->push_back(values_dims_.at(name)); - } - } - - std::vector CachedVarnames() const { - return meta_.cached_varnames; - } - - void Load(const std::string &dirname) { - rwlock_->WRLock(); - VLOG(1) << "load " << meta_.name << " from dir: " << dirname << " begin"; - - std::vector filenames; - for (auto &value_name : meta_.value_names) { - auto filename = string::Sprintf("%s/%s", dirname, value_name); - filenames.push_back(filename); - } - - LoadFromSelectedRows(filenames, meta_.value_names); - VLOG(1) << "load " << meta_.name << " in dir: " << dirname << " done"; - rwlock_->UNLock(); - } - - void LoadFromSelectedRows(const std::vector &filenames, - const std::vector &valuenames) { - std::vector> variables; - auto place = platform::CPUPlace(); - - for (int i = 0; i < static_cast(filenames.size()); i++) { - auto var = std::make_shared(); - variables.push_back(var); - auto &filename = filenames[i]; - std::ifstream fin(filename, std::ios::binary); - auto *selectedRows = var->GetMutable(); - - platform::DeviceContextPool &pool = - platform::DeviceContextPool::Instance(); - auto &dev_ctx = *pool.Get(place); - - framework::DeserializeFromStream(fin, selectedRows, dev_ctx); - selectedRows->SyncIndex(); - } - - std::vector tensors; - - for (int i = 0; i < static_cast(filenames.size()); i++) { - auto &slr = variables[i]->Get(); - auto src_t = slr.value(); - const auto *value = src_t.data(); - tensors.push_back(value); - } - - for (int i = 1; i < static_cast(filenames.size()); i++) { - auto rows_0 = variables[0]->Get().rows(); - auto rows_i = variables[i]->Get().rows(); - - bool is_equal = std::equal(rows_0.begin(), rows_0.end(), rows_i.begin()); - - if (!is_equal) { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s and %s are not equal, can not be load rightly", filenames[0], - filenames[i])); - } - } - - auto rows = variables[0]->Get().rows(); - - for (auto i = 0; i < static_cast(rows.size()); i++) { - auto id = rows[i]; - std::vector> values; - values.resize(filenames.size()); - - for (int j = 0; j < static_cast(filenames.size()); ++j) { - values[j].resize(meta_.value_dims[j]); - std::memcpy(values[j].data(), tensors[j] + i * meta_.value_dims[j], - sizeof(float) * meta_.value_dims[j]); - } - - auto *block = GetShard(id); - block->Init(id, &values, 0); - block->Update(id); - } - } - - void Save(const std::string &dirname, 
const int mode = 0) { - rwlock_->WRLock(); - VLOG(3) << "save " << meta_.name << " in dir: " << dirname << " begin"; - - MkDirRecursively(dirname.c_str()); - - std::vector filenames; - for (auto &value_name : meta_.value_names) { - auto filename = string::Sprintf("%s/%s", dirname, value_name); - filenames.push_back(filename); - } - - SaveToSelectedRows(filenames, meta_.value_names, mode); - VLOG(3) << "save " << meta_.name << " in dir: " << dirname << " done"; - rwlock_->UNLock(); - } - - void SaveToSelectedRows(const std::vector &filenames, - const std::vector &valuenames, - const int mode) { - for (auto &value_name : valuenames) { - auto it = std::find(meta_.value_names.begin(), meta_.value_names.end(), - value_name); - if (it == meta_.value_names.end()) { - PADDLE_THROW(platform::errors::InvalidArgument( - "[%s] is invalid param for [%s]", value_name, meta_.name)); - } - } - - auto place = platform::CPUPlace(); - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &dev_ctx = *pool.Get(place); - - std::vector ids; - - for (auto &block : shard_blocks_) { - for (auto value : block->values_) { - if (mode == 0) { - ids.push_back(value.first); - } else { - bool id_need_save = false; - // save all params - if (mode == 1) { - id_need_save = true; - } else { - id_need_save = value.second->seen_after_last_save_; - } - - if (id_need_save) { - ids.push_back(value.first); - } - value.second->seen_after_last_save_ = false; - } - } - } - - VLOG(3) << "save " << ids.size() << " feasigns for " << meta_.name - << " with mode: " << mode; - - std::vector> variables; - std::vector tensors; - std::vector dims; - - for (int i = 0; i < static_cast(filenames.size()); i++) { - auto dim = values_dims_.at(valuenames[i]); - auto var = std::make_shared(); - auto *slr = var->GetMutable(); - auto *src_t = slr->mutable_value(); - - src_t->Resize({static_cast(ids.size()), dim}); - auto *value = src_t->mutable_data(place); - - dims.push_back(dim); - variables.push_back(var); - tensors.push_back(value); - } - - std::vector *>> values; - Get(ids, valuenames, &values); - - int64_t offset = 0; - for (auto &vss : values) { - for (int i = 0; i < static_cast(vss.size()); i++) { - auto &vs = vss[i]; - std::memcpy(tensors[i] + offset * dims[i], vs->data(), - sizeof(float) * dims[i]); - } - offset += 1; - } - - for (auto &var : variables) { - auto *slr = var->GetMutable(); - slr->set_rows(ids); - slr->set_height(ids.size()); - } - - for (int i = 0; i < static_cast(filenames.size()); i++) { - auto &filename = filenames[i]; - auto &selectedRows = variables[i]->Get(); - - std::ofstream fout(filename, std::ios::binary); - PADDLE_ENFORCE_EQ(static_cast(fout), true, - platform::errors::Unavailable( - "Cannot open %s to save variables.", filename)); - - framework::SerializeToStream(fout, selectedRows, dev_ctx); - fout.close(); - } - } - - void SaveToText(const std::vector &filenames, - const std::vector &valuenames) { - for (auto &value_name : valuenames) { - auto it = std::find(meta_.value_names.begin(), meta_.value_names.end(), - value_name); - if (it == meta_.value_names.end()) { - PADDLE_THROW(platform::errors::InvalidArgument( - "[%s] is invalid param for [%s]", value_name, meta_.name)); - } - } - - std::vector> fouts; - - for (auto filename : filenames) { - std::unique_ptr fout(new std::ofstream(filename)); - fouts.push_back(std::move(fout)); - } - - for (auto &block : shard_blocks_) { - for (auto value : block->values_) { - std::vector *> vss = value.second->get(valuenames); - - auto id = 
value.first; - - for (int i = 0; i < static_cast(vss.size()); i++) { - auto &vs = vss[i]; - std::stringstream ss; - ss << id << "\t"; - ss << vs->size() << "\t"; - for (auto v : (*vs)) { - ss << v << " "; - } - ss << "\n"; - - fouts[i]->write(ss.str().c_str(), sizeof(char) * ss.str().size()); - } - } - } - - for (int i = 0; i < static_cast(fouts.size()); i++) { - fouts[i]->close(); - } - } - - int64_t Size() { - int64_t cnt = 0; - - for (auto &block : shard_blocks_) { - cnt += block->values_.size(); - } - return cnt; - } - - ValueBlock *GetShard(const int64_t id) { - return shard_blocks_[id & shard_mask_].get(); - } - - SparseMeta *GetMeta() { return &meta_; } - - private: - std::unique_ptr rwlock_{nullptr}; - - SparseMeta meta_; - std::unordered_map values_dims_; - const size_t shard_mask_ = 127; - const size_t shard_num_ = 128; - std::vector> shard_blocks_; -}; - -class LargeScaleKV { - public: - LargeScaleKV() {} - - explicit LargeScaleKV(const std::vector &table_metas) { - for (auto &sparse_meta : table_metas) { - auto table_name = sparse_meta.name; - auto meta = std::shared_ptr( - new SparseVariable(std::move(sparse_meta))); - sparse_variables[table_name] = meta; - grad_to_variables[sparse_meta.grad_name] = table_name; - grad_names_.push_back(sparse_meta.grad_name); - } - } - - ~LargeScaleKV() {} - - static std::shared_ptr GetInstantcePtr() { return scale_kv_; } - - static LargeScaleKV *GetInstance() { return scale_kv_.get(); } - - static LargeScaleKV *InitInstance( - const std::vector &table_metas) { - std::call_once(init_flag_, &LargeScaleKV::Init, table_metas); - return scale_kv_.get(); - } - - static void Init(const std::vector &table_metas) { - if (scale_kv_.get() == nullptr) { - scale_kv_.reset(new LargeScaleKV(table_metas)); - } - } - - SparseVariable *Get(const std::string &name) { - auto variable = sparse_variables.at(name); - return variable.get(); - } - - bool ParamInLargeScale(const std::string &name) { - auto got = sparse_variables.find(name); - - if (got == sparse_variables.end()) { - return false; - } - - return true; - } - - bool GradInLargeScale(const std::string &name) { - auto got = grad_to_variables.find(name); - - if (got == grad_to_variables.end()) { - return false; - } - - return true; - } - - SparseVariable *GetByGrad(const std::string &name) { - return Get(grad_to_variables[name]); - } - - const std::vector &GetAllGrads() { return grad_names_; } - - private: - std::unordered_map> - sparse_variables; - std::unordered_map grad_to_variables; - std::vector grad_names_; - static std::shared_ptr scale_kv_; - static std::once_flag init_flag_; -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.cc b/paddle/fluid/operators/distributed/parameter_prefetch.cc deleted file mode 100644 index 558d70e5c3353f98c052cfbcd108f859ea03e7a3..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/parameter_prefetch.cc +++ /dev/null @@ -1,311 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/distributed/parameter_prefetch.h" -#include -#include -#include -#include -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/operators/distributed/distributed.h" - -namespace paddle { -namespace framework { -class ExecutionContext; -class Scope; -} // namespace framework -} // namespace paddle - -namespace paddle { -namespace operators { -namespace distributed { - -class RPCClient; - -using LoDTensor = framework::LoDTensor; -using LoDTensor = framework::LoDTensor; -using SelectedRows = framework::SelectedRows; -using DDim = framework::DDim; - -static void SplitIdsIntoMultipleVarsBySection( - const std::vector &in_ids, - const std::vector &in_varnames, const int tables, - const int pservers, const bool is_distibuted, framework::Scope *scope, - std::vector> *splited_ids, - std::vector> *origin_ids) { - PADDLE_ENFORCE_EQ( - in_varnames.size(), tables, - platform::errors::OutOfRange( - "send varnames size: %d not equal table number: %d, internal error", - in_varnames.size(), tables)); - - PADDLE_ENFORCE_LE( - tables, pservers, - platform::errors::OutOfRange("table number %d not equal or less than " - "pserver number: %d, internal error", - tables, pservers)); - - auto place = platform::CPUPlace(); - - std::set st(in_ids.begin(), in_ids.end()); - std::vector all_ids; - all_ids.assign(st.begin(), st.end()); - - splited_ids->resize(tables); - origin_ids->resize(tables); - - if (is_distibuted) { - for (auto &id : all_ids) { - auto pserver_id = id % pservers; - (*splited_ids)[pserver_id].push_back(id); - (*origin_ids)[pserver_id].push_back(id); - } - } else { - for (auto &id : all_ids) { - auto pserver_id = id % pservers; - (*origin_ids)[pserver_id].push_back(id); - id = id / pservers; - (*splited_ids)[pserver_id].push_back(id); - } - } - - for (size_t i = 0; i < in_varnames.size(); ++i) { - auto *id_tensor = - scope->Var(in_varnames[i])->GetMutable(); - - auto &ids = (*splited_ids)[i]; - if (!ids.empty()) { - auto *id_tensor_data = id_tensor->mutable_data( - framework::make_ddim({static_cast(ids.size()), 1}), place); - memcpy(id_tensor_data, ids.data(), sizeof(int64_t) * ids.size()); - } - } -} - -typedef std::vector> TableAndEndpoints; - -void prefetch_core( - const std::vector &ids, const TableAndEndpoints &tables, - const framework::ExecutionContext &context, const framework::Scope &scope, - const bool is_distributed, - std::unordered_map> *recved_vec_map) { - distributed::RPCClient *rpc_client = - distributed::RPCClient::GetInstance( - context.Attr("trainer_id")); - - int pservers = context.Attr("pserver_num"); - - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &actual_ctx = *pool.Get(platform::CPUPlace()); - - std::unique_ptr local_scope = scope.NewTmpScope(); - - std::vector in_var_names; - std::vector out_var_names; - for (size_t i = 0; i < tables.size(); ++i) { - in_var_names.push_back("prefetch_send@" + tables[i].second); - out_var_names.push_back("prefetch_recv@" + tables[i].second); - } - - std::vector> split_ids; - 
std::vector> origin_ids; - SplitIdsIntoMultipleVarsBySection(ids, in_var_names, tables.size(), pservers, - is_distributed, local_scope.get(), - &split_ids, &origin_ids); - - // create output var in local scope - for (auto &name : out_var_names) { - local_scope->Var(name)->GetMutable(); - } - - std::vector rets; - for (size_t i = 0; i < in_var_names.size(); i++) { - if (NeedSend(*local_scope.get(), in_var_names[i])) { - VLOG(3) << "sending " << in_var_names[i] << " to " << tables[i].second - << " to get " << out_var_names[i] << " back"; - rets.push_back(rpc_client->AsyncPrefetchVar( - tables[i].second, actual_ctx, *local_scope.get(), in_var_names[i], - out_var_names[i], tables[i].first)); - } else { - VLOG(3) << "don't send no-initialied variable: " << out_var_names[i]; - } - } - for (size_t i = 0; i < rets.size(); i++) { - PADDLE_ENFORCE_NE(rets[i]->Wait(), 0U, platform::errors::ExecutionTimeout( - "internal error in RPCClient")); - } - - for (size_t o_idx = 0; o_idx < out_var_names.size(); ++o_idx) { - auto &ids_in_this_section = origin_ids[o_idx]; - - if (!ids_in_this_section.empty()) { - auto &prefetch_out_var = - local_scope->Var(out_var_names[o_idx])->Get(); - const auto *out_var_data = prefetch_out_var.data(); - auto &dims = prefetch_out_var.dims(); - - PADDLE_ENFORCE_EQ(dims.size(), 2, - platform::errors::InvalidArgument( - "The size of Tensor dims must be 2.")); - PADDLE_ENFORCE_EQ(ids_in_this_section.size(), dims[0], - platform::errors::InvalidArgument( - "The size of ids in this section must equal to " - "dims[0]: %s, but got %s", - dims[0], ids_in_this_section.size())); - - auto row_numel = dims[1]; - - for (int64_t i = 0; i < dims[0]; ++i) { - auto origin_id = ids_in_this_section[i]; - std::vector vecs(row_numel); - - std::copy_n(out_var_data + i * row_numel, row_numel, vecs.begin()); - (*recved_vec_map)[origin_id] = vecs; - } - } else { - VLOG(3) << "ids in this section is empty"; - } - } -} - -void prefetch(const std::string &id_name, const std::string &out_name, - const std::string &persistable_var_name, - const bool is_distributed, - const std::vector &table_names, - const std::vector &endpoints, - const framework::ExecutionContext &context, - const framework::Scope &scope) { - prefetchs({id_name}, {out_name}, persistable_var_name, is_distributed, - table_names, endpoints, context, scope); -} - -void prefetchs(const std::vector &id_var_names, - const std::vector &out_var_names, - const std::string &persistable_var_name, - const bool is_distributed, - const std::vector &table_names, - const std::vector &endpoints, - const framework::ExecutionContext &context, - const framework::Scope &scope) { - auto vec_dim_1 = 0; - auto vec_dim_0 = 0; - framework::Variable *var = scope.FindVar(persistable_var_name); - - if (var->IsType()) { - vec_dim_1 = var->Get().value().dims()[1]; - } else { - vec_dim_0 = var->Get().dims()[0]; - vec_dim_1 = var->Get().dims()[1]; - } - - PADDLE_ENFORCE_GT(vec_dim_1, 0, - platform::errors::InvalidArgument( - "lookup table var's dim must gather than 0")); - - const auto place = - scope.FindVar(id_var_names[0])->Get().place(); - - std::vector> ids_group; - std::vector ids_union; - std::vector ids_lods; - TableAndEndpoints tables; - - for (auto &id_name : id_var_names) { - auto &id_tensor = scope.FindVar(id_name)->Get(); - std::vector ids; - TensorToVector(id_tensor, context.device_context(), &ids); - ids_union.insert(ids_union.end(), ids.begin(), ids.end()); - ids_group.push_back(ids); - ids_lods.push_back(id_tensor.lod()); - } - - std::unordered_set 
s(ids_union.begin(), ids_union.end()); - ids_union.assign(s.begin(), s.end()); - - for (auto &i : ids_union) { - PADDLE_ENFORCE_GE( - i, 0, platform::errors::OutOfRange( - "each element in embedding should be larger or equal 0")); - if (!is_distributed) { - PADDLE_ENFORCE_LT( - i, vec_dim_0, - platform::errors::OutOfRange( - "embedding id must in [0, %d) when is_distributed False", - vec_dim_0)); - } - } - - for (size_t i = 0; i < table_names.size(); i++) { - tables.push_back(std::make_pair(table_names[i], endpoints[i])); - } - std::unordered_map> recved_vec_map; - prefetch_core(ids_union, tables, context, scope, is_distributed, - &recved_vec_map); - - auto padding_idx = distributed::kNoPadding; - - if (context.HasAttr("padding_idx")) { - padding_idx = context.Attr("padding_idx"); - } - - for (size_t i = 0; i < out_var_names.size(); i++) { - std::vector ids = ids_group[i]; - auto ids_size = ids.size(); - auto *out_t = - scope.FindVar(out_var_names[i])->GetMutable(); - out_t->set_lod(ids_lods[i]); - out_t->Resize( - framework::make_ddim({static_cast(ids_size), vec_dim_1})); - auto *out_d = out_t->mutable_data(place); - - if (platform::is_cpu_place(out_t->place())) { - for (auto idx = 0; idx < static_cast(ids_size); idx++) { - const auto &id = ids[idx]; - if (padding_idx != distributed::kNoPadding && id == padding_idx) { - memset(out_d + idx * vec_dim_1, 0, sizeof(float) * vec_dim_1); - } else { - std::copy_n(recved_vec_map[id].begin(), vec_dim_1, - out_d + idx * vec_dim_1); - } - } - } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - std::vector ids_value_vec(ids_size * vec_dim_1); - for (auto idx = 0; idx < static_cast(ids_size); idx++) { - const auto &id = ids[idx]; - if (padding_idx != distributed::kNoPadding && id == padding_idx) { - memset(&ids_value_vec[idx * vec_dim_1], 0, sizeof(float) * vec_dim_1); - } else { - memcpy(&ids_value_vec[idx * vec_dim_1], &recved_vec_map[id][0], - sizeof(float) * vec_dim_1); - } - } - auto &gpu_place = BOOST_GET_CONST(platform::CUDAPlace, out_t->place()); - auto &cpu_place = BOOST_GET_CONST( - platform::CPUPlace, paddle::platform::CPUDeviceContext().GetPlace()); - auto stream = context.cuda_device_context().stream(); - memory::Copy(gpu_place, out_d, cpu_place, &ids_value_vec[0], - sizeof(float) * ids_size * vec_dim_1, stream); -#else - PADDLE_ENFORCE(true, platform::errors::PermissionDenied( - "Paddle is not compiled with GPU!")); -#endif - } - } -} - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.h b/paddle/fluid/operators/distributed/parameter_prefetch.h deleted file mode 100644 index 6fd3a998813c0ba32b8b694b6655e1c73f45d62b..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/parameter_prefetch.h +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include -#include -#include - -#include "paddle/fluid/framework/operator.h" - -namespace paddle { -namespace framework { -class ExecutionContext; -class Scope; -} // namespace framework -} // namespace paddle - -namespace paddle { -namespace operators { -namespace distributed { - -constexpr int64_t kNoPadding = -1; - -void prefetchs(const std::vector& id_var_names, - const std::vector& out_var_names, - const std::string& persistable_var_name, const bool backfill, - const std::vector& table_names, - const std::vector& endpoints, - const framework::ExecutionContext& context, - const framework::Scope& scope); - -void prefetch(const std::string& id_name, const std::string& out_name, - const std::string& persistable_var_name, const bool backfill, - const std::vector& table_names, - const std::vector& endpoints, - const framework::ExecutionContext& context, - const framework::Scope& scope); - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/parameter_recv.cc b/paddle/fluid/operators/distributed/parameter_recv.cc deleted file mode 100644 index d5d3c9c3c7c48fa162e18823bb237901e064315c..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/parameter_recv.cc +++ /dev/null @@ -1,248 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include -#include -#include - -#include "glog/logging.h" -#include "paddle/fluid/framework/ddim.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/operators/distributed/communicator_common.h" -#include "paddle/fluid/operators/distributed/distributed.h" -#include "paddle/fluid/operators/distributed/parameter_recv.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/place.h" - -namespace paddle { -namespace operators { -namespace distributed { - -class RPCClient; - -using LoDTensor = framework::LoDTensor; -using LoDTensor = framework::LoDTensor; -using SelectedRows = framework::SelectedRows; -using DDim = framework::DDim; - -template -void RecvSparseLodTensor(const CommContext &rpc_ctx, - const framework::Scope &scope) { - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto cpu_place = platform::CPUPlace(); - auto &cpu_ctx = *pool.Get(cpu_place); - - distributed::RPCClient *rpc_client = - distributed::RPCClient::GetInstance(rpc_ctx.trainer_id); - - std::unique_ptr local_scope = scope.NewTmpScope(); - std::vector tensors; - std::vector rets; - std::vector recv_varnames; - for (size_t i = 0; i < rpc_ctx.splited_varnames.size(); i++) { - auto &recv_var_name = rpc_ctx.splited_varnames[i]; - VLOG(4) << "recv " << recv_var_name << " from " << rpc_ctx.epmap[i]; - local_scope->Var(recv_var_name); - // sparse param in recv_scope is LoDTensor - rets.push_back(rpc_client->AsyncGetVarNoBarrier( - rpc_ctx.epmap[i], cpu_ctx, *local_scope.get(), recv_var_name, - recv_var_name)); - recv_varnames.push_back(recv_var_name); - } - - for (size_t i = 0; i < rets.size(); i++) { - PADDLE_ENFORCE_NE(rets[i]->Wait(), 0U, platform::errors::ExecutionTimeout( - "internal error in RPCClient")); - auto &recv_var_name = recv_varnames[i]; - auto *local_var = local_scope->FindVar(recv_var_name); - const auto *value = local_var->Get().data(); - tensors.push_back(value); - } - - auto *merged_var = scope.FindVar(rpc_ctx.var_name); - - if (merged_var == nullptr || !merged_var->IsInitialized()) { - PADDLE_THROW( - platform::errors::InvalidArgument("%s must initialized at first.")); - } - auto dims1 = merged_var->Get().dims()[1]; - int64_t height = 0; - for (size_t i = 0; i < rpc_ctx.splited_varnames.size(); i++) { - auto *splited_var = local_scope->FindVar(rpc_ctx.splited_varnames[i]); - height += splited_var->Get().dims()[0]; - } - - PADDLE_ENFORCE_EQ( - merged_var->Get().dims()[0], height, - platform::errors::InvalidArgument( - "Received variable must has same dimension with local variable.")); - - auto *merged_t = merged_var->GetMutable(); - auto *merged_d = merged_t->mutable_data(cpu_place); - - auto pserver_num = rpc_ctx.splited_varnames.size(); - for (int x = 0; x < height; ++x) { - auto id = x % pserver_num; - auto idx = x / pserver_num; - std::memcpy(merged_d + x * dims1, tensors[id] + idx * dims1, - sizeof(float) * dims1); - } -} - -template -void RecvGeoSparseRecords(const CommContext &rpc_ctx, - const framework::Scope &scope) { - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto cpu_place = platform::CPUPlace(); - auto &cpu_ctx = *pool.Get(cpu_place); - - distributed::RPCClient *rpc_client = - distributed::RPCClient::GetInstance(rpc_ctx.trainer_id); - - std::unique_ptr local_scope = scope.NewTmpScope(); - - std::vector rets; - for (size_t i = 0; i < 
rpc_ctx.splited_varnames.size(); i++) { - auto &recv_var_name = rpc_ctx.splited_varnames[i]; - local_scope->Var(recv_var_name); - VLOG(4) << "recv " << recv_var_name << " from " << rpc_ctx.epmap[i]; - // sparse param in recv_scope is LoDTensor - rets.push_back(rpc_client->AsyncGetVar(rpc_ctx.epmap[i], cpu_ctx, - *local_scope.get(), recv_var_name, - recv_var_name, recv_var_name)); - } - - for (size_t i = 0; i < rets.size(); i++) { - PADDLE_ENFORCE_NE(rets[i]->Wait(), 0U, platform::errors::ExecutionTimeout( - "internal error in RPCClient")); - } - - int64_t height = 0; - int64_t ids_num = 0; - int64_t width = 0; - - std::vector all_ids; - auto pserver_num = rpc_ctx.splited_varnames.size(); - - for (size_t i = 0; i < rpc_ctx.splited_varnames.size(); i++) { - auto &recv_var_name = rpc_ctx.splited_varnames[i]; - auto *recv_var = local_scope->FindVar(recv_var_name); - auto &recv_t = recv_var->Get(); - - height += recv_t.height(); - ids_num += recv_t.rows().size(); - width = recv_t.value().dims()[1]; - - if (rpc_ctx.is_distributed) { - std::copy(recv_t.rows().begin(), recv_t.rows().end(), - std::back_inserter(all_ids)); - } else { - std::transform(recv_t.rows().begin(), recv_t.rows().end(), - std::back_inserter(all_ids), - [&](int64_t id) { return id * pserver_num + i; }); - } - } - - auto *var = scope.FindVar(rpc_ctx.var_name); - auto *t_ = var->GetMutable(); - T *out_data = - t_->mutable_value()->mutable_data({ids_num, width}, cpu_place); - t_->set_height(height); - t_->set_rows(all_ids); - - int64_t cnt = 0; - for (size_t i = 0; i < rpc_ctx.splited_varnames.size(); i++) { - auto &recv_var_name = rpc_ctx.splited_varnames[i]; - auto *recv_var = local_scope->FindVar(recv_var_name); - auto &recv_t = recv_var->Get(); - - auto rows = recv_t.rows().size(); - const T *in_data = recv_t.value().data(); - std::copy_n(in_data, rows * width, out_data + cnt); - cnt += rows * width; - } - t_->SyncIndex(); -} - -template -void RecvLodTensor(const CommContext &rpc_ctx, const framework::Scope &scope) { - distributed::RPCClient *rpc_client = - distributed::RPCClient::GetInstance(rpc_ctx.trainer_id); - - std::vector rets; - - // variable do not spilt - if (rpc_ctx.origin_varnames.size() == 1 && - rpc_ctx.splited_varnames.size() == 1) { - auto varname = rpc_ctx.origin_varnames[0]; - const auto place = - scope.FindVar(varname)->Get().place(); - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &ctx = *pool.Get(place); - VLOG(4) << "recv " << varname << " from " << rpc_ctx.epmap[0] << " in gpu? 
" - << platform::is_gpu_place(place); - rets.push_back(rpc_client->AsyncGetVarNoBarrier(rpc_ctx.epmap[0], ctx, - scope, varname, varname)); - - for (size_t i = 0; i < rets.size(); i++) { - PADDLE_ENFORCE_NE( - rets[i]->Wait(), 0U, - platform::errors::ExecutionTimeout("internal error in RPCClient")); - } - - VLOG(3) << "ParameterRecv out " << rpc_ctx.var_name; - return; - } else { - PADDLE_ENFORCE(false, platform::errors::Unimplemented( - "ParameterRecv can not recv dense with multi " - "parts now, add it soon.")); - } -} - -template -void ParameterRecv::operator()(const CommContext &rpc_ctx, - const framework::Scope &scope, - bool geo_records) { - VLOG(3) << "ParameterRecv in " << rpc_ctx.var_name; - - PADDLE_ENFORCE_GE(rpc_ctx.origin_varnames.size(), 1, - platform::errors::InvalidArgument( - "origin_varnames.size() >= 1 is permitted")); - - if (rpc_ctx.is_sparse) { - if (geo_records) { - RecvGeoSparseRecords(rpc_ctx, scope); - } else { - RecvSparseLodTensor(rpc_ctx, scope); - } - } else { - RecvLodTensor(rpc_ctx, scope); - } - - VLOG(3) << "ParameterRecv out " << rpc_ctx.var_name; -} -template -void ParameterRecv::operator()(const CommContext &rpc_ctx, - const framework::Scope &scope) { - this->operator()(rpc_ctx, scope, false); -} - -template struct ParameterRecv; - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/parameter_recv.h b/paddle/fluid/operators/distributed/parameter_recv.h deleted file mode 100644 index c30d21aa791e23cdebfb35135a292ad846c2576c..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/parameter_recv.h +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include - -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/distributed/communicator_common.h" - -namespace paddle { -namespace operators { -namespace distributed { - -template -struct ParameterRecv { - void operator()(const CommContext &rpc_ctx, const framework::Scope &scope, - bool barrier); - - void operator()(const CommContext &rpc_ctx, const framework::Scope &scope); -}; - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/parameter_send.cc b/paddle/fluid/operators/distributed/parameter_send.cc deleted file mode 100644 index 109514ca2541c35b515a357108cd303ae0eeff91..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/parameter_send.cc +++ /dev/null @@ -1,331 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/distributed/parameter_send.h" -#include -#include -#include "glog/logging.h" -#include "paddle/fluid/framework/ddim.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/operators/distributed/communicator_common.h" -#include "paddle/fluid/operators/distributed/distributed.h" -#include "paddle/fluid/operators/distributed/request_handler.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/place.h" - -namespace paddle { -namespace framework { -class Scope; -class Tensor; -} // namespace framework -} // namespace paddle - -namespace paddle { -namespace operators { -namespace distributed { - -class RPCClient; - -using LoDTensor = framework::LoDTensor; -using LoDTensor = framework::LoDTensor; -using SelectedRows = framework::SelectedRows; -using DDim = framework::DDim; - -typedef std::vector> EP_SPLIT_TABLE_PAIRS; - -inline EP_SPLIT_TABLE_PAIRS GetMultiFieldCommContext( - const CommContext &rpc_ctx, const framework::Scope &scope, - int multi_parts) { - EP_SPLIT_TABLE_PAIRS table_pairs; - - auto *send_var = scope.FindVar(rpc_ctx.var_name); - if (send_var->IsType()) { - PADDLE_ENFORCE_GE(multi_parts, 1, - platform::errors::InvalidArgument( - "multi_parts must == 1 in parameter send, now is: %d", - multi_parts)); - - for (size_t i = 0; i < rpc_ctx.splited_varnames.size(); i++) { - table_pairs.push_back( - std::make_pair(rpc_ctx.epmap[i], rpc_ctx.splited_varnames[i])); - } - - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "GetMultiFieldCommContext unsupported LoDTensor current!")); - } - - return table_pairs; -} // namespace distributed - -void SendByNotifyRPC(const CommContext &rpc_ctx, - const framework::Scope &scope) { - auto cpu_ctx = paddle::platform::CPUDeviceContext(); - auto &send_var_name = rpc_ctx.var_name; - std::vector rets; - - distributed::RPCClient *rpc_client = - distributed::RPCClient::GetInstance(rpc_ctx.trainer_id); - - if (NeedSend(scope, send_var_name)) { - for (size_t j = 0; j < rpc_ctx.epmap.size(); j++) { - auto &endpoint = rpc_ctx.epmap[j]; - VLOG(4) << "sending " << send_var_name << " to " << endpoint; - rets.push_back(rpc_client->AsyncDistributeNotify(endpoint, cpu_ctx, scope, - send_var_name)); - VLOG(4) << "send var " << send_var_name << " by notify RPC done"; - } - } else { - VLOG(3) << "don't send non-initialized variable: " << rpc_ctx.var_name; - } - - for (auto &handle : rets) { - PADDLE_ENFORCE_NE(handle->Wait(), 0U, platform::errors::ExecutionTimeout( - "internal error in RPCClient")); - } -} - -template -void ParameterSend::operator()(const CommContext &rpc_ctx, - const framework::Scope &scope, bool sync, - int multi_parts) { - if (rpc_ctx.var_name == STEP_COUNTER) { - SendByNotifyRPC(rpc_ctx, scope); - return; - } - - std::unique_ptr local_scope = scope.NewTmpScope(); - - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &cpu_ctx = *pool.Get(platform::CPUPlace()); - - distributed::RPCClient *rpc_client = - 
distributed::RPCClient::GetInstance(rpc_ctx.trainer_id); - - std::vector rets; - auto *send_var = scope.FindVar(rpc_ctx.var_name); - - if (send_var->IsType()) { - size_t out_num = rpc_ctx.splited_varnames.size(); - if (out_num > 1) { - auto &send_tensor = send_var->Get(); - auto &send_tensor_dims = send_tensor.dims(); - std::vector outs_dims; - outs_dims.reserve(out_num); - - // infer output shape - PADDLE_ENFORCE_EQ( - rpc_ctx.height_sections.size(), out_num, - platform::errors::InvalidArgument("tensor split sections size" - "should be equal to output size.")); - for (size_t i = 0; i < out_num; ++i) { - auto dim = send_tensor_dims; - dim[0] = rpc_ctx.height_sections[i]; - outs_dims.push_back(dim); - } - - // create output var in local scope - size_t row_offset = 0; - for (size_t i = 0; i < out_num; ++i) { - framework::Tensor *out = local_scope->Var(rpc_ctx.splited_varnames[i]) - ->GetMutable(); - *out = send_tensor.Slice(row_offset, row_offset + outs_dims[i][0]); - row_offset += outs_dims[i][0]; - } - } else { - auto &send_tensor = send_var->Get(); - framework::Tensor *out = local_scope->Var(rpc_ctx.splited_varnames[0]) - ->GetMutable(); - out->ShareDataWith(send_tensor); - } - - for (size_t i = 0; i < rpc_ctx.splited_varnames.size(); i++) { - auto &send_var_name = rpc_ctx.splited_varnames[i]; - auto &endpoint = rpc_ctx.epmap[i]; - VLOG(4) << " send var name: " << send_var_name - << "endpoint: " << endpoint; - if (NeedSend(*local_scope.get(), send_var_name)) { - VLOG(3) << "sending " << send_var_name << " to " << endpoint; - rets.push_back(rpc_client->AsyncSendVar( - endpoint, cpu_ctx, *local_scope.get(), send_var_name)); - VLOG(4) << "send var " << send_var_name << " async handle done"; - } else { - VLOG(3) << "don't send non-initialized variable: " - << rpc_ctx.splited_varnames[i]; - } - } - } else if (send_var->IsType()) { - auto &send_slr = send_var->Get(); - - auto &send_rows = send_slr.rows(); - if (send_rows.size() == 0) { - LOG(WARNING) - << "WARNING: The variable sent to pserver is empty, which " - "may cause an unknown error. 
Please check the state of " - "use_double_buffer in pyreader/dataloader async mode, you need to " - "turn it false."; - } - - std::vector> outs_rows_idx; - std::vector> outs_dense_idx; - - auto table_pairs = GetMultiFieldCommContext(rpc_ctx, scope, 1); - outs_rows_idx.resize(table_pairs.size()); - outs_dense_idx.resize(table_pairs.size()); - - auto row_numel = send_slr.value().numel() / send_slr.value().dims()[0]; - auto *src = send_slr.value().data(); - - // create output var in local scope - std::vector outs; - for (auto &table : table_pairs) { - auto *out = - local_scope->Var(table.second)->GetMutable(); - outs.push_back(out); - } - - if (!rpc_ctx.is_distributed) { - auto pserver_num = rpc_ctx.epmap.size(); - - // split rows index into output sparse vars - for (size_t i = 0; i < send_rows.size(); ++i) { - auto ep_idx = send_rows[i] % pserver_num; - auto id = send_rows[i] / pserver_num; - outs_rows_idx[ep_idx].push_back(id); - outs_dense_idx[ep_idx].push_back(i); - } - - auto place = platform::CPUPlace(); - - for (size_t out_idx = 0; out_idx < rpc_ctx.splited_varnames.size(); - out_idx++) { - auto rows_idx = outs_rows_idx[out_idx]; - - auto dims = send_slr.GetCompleteDims(); - dims[0] = rows_idx.size(); - outs[out_idx]->set_height(rpc_ctx.height_sections[out_idx]); - outs[out_idx]->mutable_rows()->clear(); - outs[out_idx]->mutable_value()->mutable_data(dims, send_slr.place()); - - if (rows_idx.size() > 0) { - for (auto idx : rows_idx) { - outs[out_idx]->mutable_rows()->push_back(idx); - } - auto dst = outs[out_idx]->mutable_value()->mutable_data(place); - for (size_t j = 0; j < rows_idx.size(); j++) { - if (platform::is_cpu_place(place)) { - memory::Copy(platform::CPUPlace(), dst + j * row_numel, - platform::CPUPlace(), - src + outs_dense_idx[out_idx][j] * row_numel, - sizeof(T) * row_numel); - } else { - PADDLE_THROW( - platform::errors::Unimplemented("do not support GPU now")); - } - } - } - PADDLE_ENFORCE_EQ( - rows_idx.size(), outs[out_idx]->rows().size(), - platform::errors::InvalidArgument( - "rows should has the same size with tensor dim 0")); - } - } else { - auto pserver_num = rpc_ctx.epmap.size(); - - // split rows index into output sparse vars - for (size_t i = 0; i < send_rows.size(); ++i) { - auto out_idx = send_rows[i] % pserver_num; - outs_rows_idx[out_idx].push_back(send_rows[i]); - outs_dense_idx[out_idx].push_back(i); - } - - auto place = platform::CPUPlace(); - - for (size_t out_idx = 0; out_idx < rpc_ctx.splited_varnames.size(); - out_idx++) { - auto rows_idx = outs_rows_idx[out_idx]; - - auto dims = send_slr.GetCompleteDims(); - dims[0] = rows_idx.size(); - - outs[out_idx]->set_height(rpc_ctx.height_sections[out_idx]); - outs[out_idx]->mutable_rows()->clear(); - outs[out_idx]->mutable_value()->mutable_data(dims, send_slr.place()); - - if (rows_idx.size() > 0) { - for (auto idx : rows_idx) { - outs[out_idx]->mutable_rows()->push_back(idx); - } - auto dst = outs[out_idx]->mutable_value()->mutable_data(place); - for (size_t j = 0; j < rows_idx.size(); j++) { - if (platform::is_cpu_place(place)) { - memory::Copy(platform::CPUPlace(), dst + j * row_numel, - platform::CPUPlace(), - src + outs_dense_idx[out_idx][j] * row_numel, - sizeof(T) * row_numel); - } else { - PADDLE_THROW( - platform::errors::Unimplemented("do not support GPU now")); - } - } - } - PADDLE_ENFORCE_EQ( - rows_idx.size(), outs[out_idx]->rows().size(), - platform::errors::InvalidArgument( - "rows should has the same size with tensor dim 0")); - } - } - - for (size_t i = 0; i < table_pairs.size(); i++) { - 
auto &send_var_name = table_pairs[i].second; - auto &endpoint = table_pairs[i].first; - auto need_send = NeedSend(*local_scope.get(), send_var_name); - - VLOG(4) << "send var name: " << send_var_name - << " send var endpoint: " << endpoint - << " need send: " << need_send; - - if (need_send) { - VLOG(4) << "sending " << send_var_name << " to " << endpoint; - - rets.push_back(rpc_client->AsyncSendVar( - endpoint, cpu_ctx, *local_scope.get(), send_var_name)); - VLOG(4) << "send var " << send_var_name << " async handle done"; - } else { - VLOG(4) << "don't send non-initialized variable: " - << rpc_ctx.splited_varnames[i]; - } - } - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "unsupported var type: %s to send!", send_var->Type())); - } - - VLOG(4) << "Prepare to send var " << rpc_ctx.var_name; - if (sync) { - for (auto &handle : rets) { - VLOG(4) << "Wait send var to pserver handle: " << handle; - PADDLE_ENFORCE_NE(handle->Wait(), 0U, platform::errors::ExecutionTimeout( - "internal error in RPCClient")); - } - } -} - -template struct ParameterSend; - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/proto_encoder_helper.h b/paddle/fluid/operators/distributed/proto_encoder_helper.h deleted file mode 100644 index cedc98b1fcadd4263bf3aaaec3dd0047c9fb4b36..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/proto_encoder_helper.h +++ /dev/null @@ -1,146 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -// NOTE: This file was originally created by tensorflow -// (https://github.com/tensorflow/tensorflow/) we borrow this -// file and did some modifications so that we can send gRPC -// requests without too much copying of the tensor data. 
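The note above is the key context for this deletion: rather than building a protobuf message object and copying tensor bytes into it, the helper framed gRPC payloads by hand using protobuf's wire format, so tensor data could be appended to the output buffer exactly once. As a minimal, self-contained sketch of that framing (the field number and the float payload below are invented for illustration; only the varint / length-delimited layout itself is taken from the code that follows):

// Sketch of protobuf wire-format framing for a length-delimited field:
// key = (field_number << 3) | 2, then a varint length, then the raw bytes.
#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

static void EncodeVarint32(uint32_t v, std::string* out) {
  // Low 7 bits first, MSB set on every byte except the last.
  while (v >= 0x80) {
    out->push_back(static_cast<char>((v & 0x7f) | 0x80));
    v >>= 7;
  }
  out->push_back(static_cast<char>(v));
}

static void WriteLengthDelimited(uint32_t field_number, const char* data,
                                 uint32_t size, std::string* out) {
  EncodeVarint32((field_number << 3) | 2, out);  // field key, wire type 2
  EncodeVarint32(size, out);                     // payload length
  out->append(data, size);                       // raw bytes, single copy
}

int main() {
  std::vector<float> tensor = {1.f, 2.f, 3.f};  // hypothetical tensor payload
  std::string buf;
  WriteLengthDelimited(1, reinterpret_cast<const char*>(tensor.data()),
                       static_cast<uint32_t>(tensor.size() * sizeof(float)),
                       &buf);
  std::cout << "encoded " << buf.size() << " bytes\n";
  return 0;
}

The ProtoEncodeHelper removed below wraps this same pattern behind WriteVarlengthBeginning (key plus length) and WriteRawBytes (payload).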
- -#pragma once - -#include - -#include "grpc++/grpc++.h" -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace operators { -namespace distributed { - -char* EncodeVarint32(char* dst, uint32_t v) { - // Operate on characters as unsigneds - unsigned char* ptr = reinterpret_cast(dst); - static const int B = 128; - if (v < (1 << 7)) { - *(ptr++) = v; - } else if (v < (1 << 14)) { - *(ptr++) = v | B; - *(ptr++) = v >> 7; - } else if (v < (1 << 21)) { - *(ptr++) = v | B; - *(ptr++) = (v >> 7) | B; - *(ptr++) = v >> 14; - } else if (v < (1 << 28)) { - *(ptr++) = v | B; - *(ptr++) = (v >> 7) | B; - *(ptr++) = (v >> 14) | B; - *(ptr++) = v >> 21; - } else { - *(ptr++) = v | B; - *(ptr++) = (v >> 7) | B; - *(ptr++) = (v >> 14) | B; - *(ptr++) = (v >> 21) | B; - *(ptr++) = v >> 28; - } - return reinterpret_cast(ptr); -} - -char* EncodeVarint64(char* dst, uint64_t v) { - static const int B = 128; - unsigned char* ptr = reinterpret_cast(dst); - while (v >= B) { - *(ptr++) = (v & (B - 1)) | B; - v >>= 7; - } - *(ptr++) = static_cast(v); - return reinterpret_cast(ptr); -} - -int VarintLength(uint64_t v) { - int len = 1; - while (v >= 128) { - v >>= 7; - len++; - } - return len; -} - -class ProtoEncodeHelper { - public: - ProtoEncodeHelper(char* buf, int max_size) - : base_(buf), p_(buf), limit_(base_ + max_size) {} - - ~ProtoEncodeHelper() {} - - const char* data() const { return base_; } - size_t size() const { return p_ - base_; } - - void WriteUint64(int tag, uint64_t v) { - Encode32(combine(tag, WIRETYPE_VARINT)); - Encode64(v); - } - void WriteBool(int tag, bool v) { - Encode32(combine(tag, WIRETYPE_VARINT)); - EncodeBool(v); - } - void WriteString(int tag, const std::string& v) { - Encode32(combine(tag, WIRETYPE_LENGTH_DELIMITED)); - Encode32(v.size()); - EncodeBytes(v.data(), v.size()); - } - void WriteVarlengthBeginning(int tag, uint32_t len) { - Encode32(combine(tag, WIRETYPE_LENGTH_DELIMITED)); - Encode32(len); - } - void WriteRawBytes(const std::string& v) { EncodeBytes(v.data(), v.size()); } - - private: - // Note: this module's behavior must match the protocol buffer wire encoding - // format. - enum { - WIRETYPE_VARINT = 0, - WIRETYPE_LENGTH_DELIMITED = 2, - }; - static uint32_t combine(uint32_t tag, uint32_t type) { - return ((tag << 3) | type); - } - inline void Encode32(uint32_t v) { - if (v < 128) { - // Fast path for single-byte values. Many of the calls will use a - // constant value for v, so the comparison will get optimized away - // when Encode32 is inlined into the caller. - *p_ = v; - p_++; - } else { - p_ = EncodeVarint32(p_, v); - } - } - void Encode64(uint64_t v) { p_ = EncodeVarint64(p_, v); } - void EncodeBool(bool v) { - *p_ = (v ? 1 : 0); // Equal to varint32 encoding of 0 or 1 - p_++; - } - void EncodeBytes(const char* bytes, int N) { - memcpy(p_, bytes, N); - p_ += N; - } - - char* base_; - char* p_; - char* limit_; // Just for CHECKs -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/request_handler.h b/paddle/fluid/operators/distributed/request_handler.h deleted file mode 100644 index 44359af1b1b2a6a161adcc83b97ea5fad96eecb0..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/request_handler.h +++ /dev/null @@ -1,261 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include // NOLINT - -#include -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/executor.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/platform/macros.h" - -namespace paddle { -namespace operators { -namespace distributed { - -constexpr char kRequestSend[] = "RequestSend"; -constexpr char kRequestGet[] = "RequestGet"; -constexpr char kRequestGetMonomerVariable[] = "RequestGetMonomerVariable"; -constexpr char kRequestGetMonomerBarrier[] = "RequestGetMonomerBarrier"; -constexpr char kRequestPrefetch[] = "RequestPrefetch"; -constexpr char kRequestCheckpoint[] = "RequestCheckpoint"; -constexpr char kRequestPassBarrier[] = "RequestPassBarrier"; -constexpr char kRequestGetNoBarrier[] = "GetVariableNoBarrier"; -constexpr char kRequestNotify[] = "RequestNotify"; -constexpr char kRequestSendAndRecv[] = "RequestSendAndRecv"; - -constexpr char kSendRPC[] = "SendRPC"; -constexpr char kGetRPC[] = "GetRPC"; -constexpr char kGetNoBarrierRPC[] = "GetNoBarrierRPC"; -constexpr char kGetMonomerRPC[] = "GetMonomerRPC"; -constexpr char kPrefetchRPC[] = "PrefetchRPC"; -constexpr char kBatchBarrierRPC[] = "BatchBarrierRPC"; -constexpr char kFetchBarrierRPC[] = "FetchBarrierRPC"; -constexpr char kSendMonomerFetchBarrierRPC[] = "SendMonomerFetchBarrierRPC"; -constexpr char kSendCompleteRPC[] = "SendCompleteRPC"; -constexpr char kCheckPointNotifyRPC[] = "CheckPointNotifyRPC"; -constexpr char kSendAndRecvRPC[] = "SendAndRecvRPC"; -constexpr int64_t kPrefetchTimeout = 60000; - -#define LISTEN_TERMINATE_MESSAGE "TERMINATE@RECV" -#define BATCH_BARRIER_MESSAGE "BATCH_BARRIER@RECV" -#define FETCH_BARRIER_MESSAGE "FETCH_BARRIER@RECV" -#define COMPLETE_MESSAGE "COMPLETE@RECV" -#define WITHOUT_BARRIER_MESSAGE "@WITHOUT_BARRIER@RECV" -#define LEARNING_RATE_DECAY_COUNTER "@LR_DECAY_COUNTER@" -#define STEP_COUNTER "@PS_STEP_COUNTER@" - -#define CHECKPOINT_SAVE_MESSAGE "SAVE@CHECKPOINTNOTIFY" -#define CHECKPOINT_LOAD_MESSAGE "LOAD@CHECKPOINTNOTIFY" - -enum DistributedMode { kSync = 0, kAsync = 1, kHalfAsync = 2, kGeo = 3 }; - -class RPCServer; - -class VarHandle { - public: - VarHandle(const std::string ep, const std::string& method, - const std::string& name, - const platform::DeviceContext* p_ctx = nullptr, - const framework::Scope* p_scope = nullptr) - : status_(kDefaultState) { - ep_ = ep; - ctx_ = p_ctx; - scope_ = p_scope; - name_ = name; - method_ = method; - } - - virtual ~VarHandle() {} - - public: - bool should_retry = false; - - bool Wait() { - int ret = kDefaultState; - { - std::unique_lock lk(sync_mutex_); - wait_cond_.wait(lk, [this] { return status_ != kDefaultState; }); - ret = status_; - } - VLOG(7) << "VarHandle wait:" << ret; - return ret != kErrorState; - } - - void Finish(bool ok) { - { - std::unique_lock lk(sync_mutex_); - status_ = ok ? 
kFinishState : kErrorState; - } - VLOG(7) << "VarHandle finish:" << ok; - wait_cond_.notify_all(); - } - - std::string String() const { - std::ostringstream s; - s << method_ << " name:[" << name_ << "], ep:[" << ep_ << "], status:[" - << status_ << "]"; - return s.str(); - } - - std::string ep() const { return ep_; } - const platform::DeviceContext* ctx() const { return ctx_; } - const framework::Scope* scope() const { return scope_; } - std::string name() const { return name_; } - std::string method() const { return method_; } - - protected: - // RPC endpoint. - std::string ep_; - const platform::DeviceContext* ctx_; - const framework::Scope* scope_; - // Variable name. - std::string name_; - // RPC method name. - std::string method_; - - protected: - std::mutex sync_mutex_; - std::condition_variable wait_cond_; - - enum VarHandleStatus { - kDefaultState = -1, - kErrorState = 0, - kFinishState = 1, - }; - VarHandleStatus status_; - - private: - DISABLE_COPY_AND_ASSIGN(VarHandle); -}; - -typedef std::shared_ptr VarHandlePtr; - -class RequestHandler { - public: - explicit RequestHandler(int distributed_mode) - : distributed_mode_(distributed_mode), - dev_ctx_(nullptr), - executor_(nullptr), - scope_(nullptr), - program_(nullptr), - rpc_server_(nullptr) {} - - virtual ~RequestHandler() {} - - // Set attributes. - void SetScope(framework::Scope* scope) { scope_ = scope; } - void SetDevCtx(const platform::DeviceContext* dev_ctx) { dev_ctx_ = dev_ctx; } - void SetProgram(framework::ProgramDesc* program) { program_ = program; } - void SetExecutor(framework::Executor* executor) { executor_ = executor; } - - // Used for dist lookup table prefetch - void SetPrefetchPreparedCtx( - std::unordered_map< - std::string, std::shared_ptr>* g) { - prefetch_var_name_to_prepared_ctx_ = g; - } - - void SetCheckpointNotifyPreparedCtx( - std::shared_ptr g) { - checkpoint_prepared_ctx_ = g; - } - - // Used for async. - void SetGradToPreparedCtx( - std::unordered_map< - std::string, std::shared_ptr>* g) { - grad_to_prepared_ctx_ = g; - } - - void SetSparseGradToParam(std::unordered_map* g) { - sparse_grad_to_param_ = g; - } - - void SetLrDecayPreparedCtx( - std::shared_ptr g) { - lr_decay_prepared_ctx_ = g; - } - - void SetRPCServer(RPCServer* rpc_server) { rpc_server_ = rpc_server; } - - // Get attributes. - int distributed_mode() { return distributed_mode_; } - framework::Scope* scope() { return scope_; } - const platform::DeviceContext* dev_ctx() { return dev_ctx_; } - framework::ProgramDesc* program() { return program_; } - framework::Executor* executor() { return executor_; } - - // This function processes user's rpc request. - // The implemention is in request_handler_impl. 
- // example: - // std::string varname = request_.varname(); - // - // auto scope = request_handler_->scope(); - // auto invar = scope->FindVar(varname); - // framework::Variable* outvar = nullptr; - // - // request_handler_->Handle(varname, scope, invar, &outvar); - // if (outvar) { - // SerializeToByteBuffer(varname, outvar, - // *request_handler_->dev_ctx(), &reply_); - // } - virtual bool Handle(const std::string& varname, framework::Scope* scope, - framework::Variable* var, framework::Variable** outvar, - const int trainer_id, - const std::string& out_var_name = "", - const std::string& table_name = "") = 0; - - protected: - const int distributed_mode_; - - const platform::DeviceContext* dev_ctx_; - framework::Executor* executor_; - framework::Scope* scope_; - framework::ProgramDesc* program_; - - // used for distribute lookup table prefetch - std::unordered_map>* - prefetch_var_name_to_prepared_ctx_; - // used for checkpoint notify - std::shared_ptr checkpoint_prepared_ctx_; - - // Used for async. - std::unordered_map>* - grad_to_prepared_ctx_; - std::unordered_map* sparse_grad_to_param_; - - // used for lr decay - std::shared_ptr lr_decay_prepared_ctx_; - RPCServer* rpc_server_; -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/request_handler_impl.cc b/paddle/fluid/operators/distributed/request_handler_impl.cc deleted file mode 100644 index 8c4f2ef57a32c852f2759e0fc0dcfc63f6d38578..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/request_handler_impl.cc +++ /dev/null @@ -1,354 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/distributed/request_handler_impl.h" -#include -#include -#include - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/variable_helper.h" -#include "paddle/fluid/operators/distributed/rpc_server.h" -#include "paddle/fluid/string/piece.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/fluid/string/split.h" - -#include "paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h" -#include "paddle/fluid/operators/distributed/heart_beat_monitor.h" -#include "paddle/fluid/operators/distributed/large_scale_kv.h" - -namespace paddle { -namespace operators { -namespace distributed { - -// define LOOKUP_TABLE_PATH for checkpoint notify to save lookup table variables -// to directory specified. 
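One detail of the send handler removed below is worth spelling out: in async modes, gradient shards arrive under decorated names containing "@PIECE", and the handler recovers the base variable name by splitting on '@' (enforcing exactly three parts) before renaming the scope variable and running the prepared context registered for the base name. A small, self-contained sketch of that name handling follows; the decorated name is hypothetical, since the actual naming scheme comes from the Python-side transpiler and is not part of this diff:

#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Standalone stand-in for the string::Split(varname, '@') call in the handler.
static std::vector<std::string> SplitOnAt(const std::string& s) {
  std::vector<std::string> parts;
  std::stringstream ss(s);
  std::string item;
  while (std::getline(ss, item, '@')) parts.push_back(item);
  return parts;
}

int main() {
  std::string varname = "emb_grad@PIECE@1";  // hypothetical decorated name
  std::string run_varname = varname;
  if (varname.find("@PIECE") != std::string::npos) {
    auto parts = SplitOnAt(varname);
    if (parts.size() == 3) {
      run_varname = parts[0];  // the handler also renames the var in the scope
    }
  }
  std::cout << "dispatch under: " << run_varname << "\n";  // prints emb_grad
  return 0;
}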
-constexpr char LOOKUP_TABLE_PATH[] = "kLookupTablePath"; - -bool RequestSendHandler::Handle(const std::string &varname, - framework::Scope *scope, - framework::Variable *invar, - framework::Variable **outvar, - const int trainer_id, - const std::string &out_var_name, - const std::string &table_name) { - VLOG(4) << "RequestSendHandler:" << varname; - - // Sync - if (varname == BATCH_BARRIER_MESSAGE) { - VLOG(3) << "sync: recv BATCH_BARRIER_MESSAGE"; - rpc_server_->IncreaseBatchBarrier(kRequestSend); - } else if (varname == COMPLETE_MESSAGE) { - VLOG(3) << "sync: recv complete message"; - - if (HeartBeatMonitor::GetInstance() != nullptr) { - HeartBeatMonitor::GetInstance()->Update(trainer_id, "", COMPLETED); - } - - rpc_server_->Complete(); - } else { - // Async - if (distributed_mode_ != DistributedMode::kSync) { - VLOG(3) << "async process var: " << varname; - if (varname == BATCH_BARRIER_MESSAGE) { - PADDLE_THROW(platform::errors::InvalidArgument( - "async mode should not recv BATCH_BARRIER_MESSAGE or " - "COMPLETE_MESSAGE")); - } - HeartBeatMonitor::GetInstance()->Update(trainer_id, varname, RUNNING); - - std::string run_varname = varname; - - string::Piece part_piece("@PIECE"); - string::Piece var_name_piece = string::Piece(varname); - - if (string::Contains(var_name_piece, part_piece)) { - auto varname_splits = paddle::string::Split(varname, '@'); - PADDLE_ENFORCE_EQ( - varname_splits.size(), 3, - platform::errors::InvalidArgument( - "varname: %s should be separated into 3 parts by @", varname)); - run_varname = varname_splits[0]; - scope->Rename(varname, run_varname); - } - - auto *var = scope->FindVar(run_varname); - - // for sparse ids - if (var->IsType()) { - if (distributed_mode_ == DistributedMode::kAsync || - distributed_mode_ == DistributedMode::kHalfAsync) { - auto *ins = distributed::LargeScaleKV::GetInstance(); - if (ins->GradInLargeScale(run_varname)) { - auto *large_scale_var = ins->GetByGrad(run_varname); - - for (auto name : large_scale_var->CachedVarnames()) { - scope->Var(name); - } - } - } - if (distributed_mode_ == DistributedMode::kGeo) { - if (AsyncSparseParamUpdateRecorder::GetInstance()->HasGrad( - run_varname)) { - auto &grad_slr = - scope->FindVar(run_varname)->Get(); - AsyncSparseParamUpdateRecorder::GetInstance()->Update( - run_varname, grad_slr.rows()); - } - } - } - - executor_->RunPreparedContext((*grad_to_prepared_ctx_)[run_varname].get(), - scope); - return true; - } else { // sync - rpc_server_->WaitCond(kRequestSend); - VLOG(3) << "sync: processing received var: " << varname; - PADDLE_ENFORCE_NOT_NULL( - invar, platform::errors::NotFound( - "sync: Can not find server side var %s.", varname)); - } - } - return true; -} - -bool RequestGetHandler::Handle(const std::string &varname, - framework::Scope *scope, - framework::Variable *invar, - framework::Variable **outvar, - const int trainer_id, - const std::string &out_var_name, - const std::string &table_name) { - VLOG(3) << "RequestGetHandler:" << varname - << " out_var_name: " << out_var_name << " trainer_id: " << trainer_id - << " table_name: " << table_name; - - if (distributed_mode_ == DistributedMode::kSync) { - if (varname == FETCH_BARRIER_MESSAGE) { - VLOG(3) << "sync: recv fetch barrier message"; - rpc_server_->IncreaseBatchBarrier(kRequestGet); - } else { - rpc_server_->WaitCond(kRequestGet); - *outvar = scope_->FindVar(varname); - } - } else { - if (varname != FETCH_BARRIER_MESSAGE && varname != COMPLETE_MESSAGE) { - if (enable_dc_asgd_) { - // NOTE: the format is determined by 
distribute_transpiler.py - std::string param_bak_name = - string::Sprintf("%s.trainer_%d_bak", varname, trainer_id); - VLOG(3) << "getting " << param_bak_name << " trainer_id " << trainer_id; - auto var = scope_->FindVar(varname); - auto t_orig = var->Get(); - auto param_bak = scope_->Var(param_bak_name); - auto t = param_bak->GetMutable(); - t->mutable_data(dev_ctx_->GetPlace(), t_orig.type()); - VLOG(3) << "copying " << varname << " to " << param_bak_name; - framework::TensorCopy(t_orig, dev_ctx_->GetPlace(), t); - } - - if (distributed_mode_ == DistributedMode::kGeo && - AsyncSparseParamUpdateRecorder::GetInstance()->HasParam(varname) && - !table_name.empty()) { - VLOG(3) << "AsyncSparseParamUpdateRecorder " << varname << " exist "; - - std::vector updated_rows; - AsyncSparseParamUpdateRecorder::GetInstance()->GetAndClear( - varname, trainer_id, &updated_rows); - - if (VLOG_IS_ON(3)) { - std::ostringstream sstream; - sstream << "["; - for (auto &row_id : updated_rows) { - sstream << row_id << ", "; - } - sstream << "]"; - VLOG(3) << "updated_rows size: " << updated_rows.size() << " " - << sstream.str(); - } - - auto &origin_tensor = - scope_->FindVar(varname)->Get(); - auto *origin_tensor_data = origin_tensor.data(); - auto &dims = origin_tensor.dims(); - *outvar = scope->Var(); - auto *out_slr = (*outvar)->GetMutable(); - out_slr->set_rows(updated_rows); - out_slr->set_height(dims[0]); - auto out_dims = framework::make_ddim( - {static_cast(updated_rows.size()), dims[1]}); - auto *data = out_slr->mutable_value()->mutable_data( - out_dims, origin_tensor.place()); - auto width = dims[1]; - for (size_t i = 0; i < updated_rows.size(); ++i) { - PADDLE_ENFORCE_LT( - updated_rows[i], dims[0], - platform::errors::OutOfRange( - "The value of updated_rows: %s out of Tensor %s dims[0]: %s", - updated_rows[i], varname, dims[0])); - memcpy(data + i * width, origin_tensor_data + updated_rows[i] * width, - sizeof(float) * width); - } - } else { - *outvar = scope_->FindVar(varname); - } - } - } - return true; -} - -bool RequestGetNoBarrierHandler::Handle(const std::string &varname, - framework::Scope *scope, - framework::Variable *invar, - framework::Variable **outvar, - const int trainer_id, - const std::string &out_var_name, - const std::string &table_name) { - VLOG(4) << "RequestGetNoBarrierHandler:" << varname - << " out_var_name: " << out_var_name; - - // get var from pserver immediately without barriers - string::Piece without_barrier_piece(WITHOUT_BARRIER_MESSAGE); - string::Piece var_name_piece = string::Piece(varname); - - if (string::Contains(var_name_piece, without_barrier_piece)) { - var_name_piece = string::TrimSuffix(var_name_piece, without_barrier_piece); - VLOG(4) << "Get var " << var_name_piece << " with " - << WITHOUT_BARRIER_MESSAGE; - *outvar = scope_->FindVar(var_name_piece.ToString()); - return true; - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "GetNoBarrier must contain %s", WITHOUT_BARRIER_MESSAGE)); - } - return true; -} - -bool RequestPrefetchHandler::Handle(const std::string &varname, - framework::Scope *scope, - framework::Variable *invar, - framework::Variable **outvar, - const int trainer_id, - const std::string &out_var_name, - const std::string &table_name) { - VLOG(4) << "RequestPrefetchHandler " << varname; - - (*outvar)->GetMutable(); - - VLOG(1) << "Prefetch " - << "tablename: " << table_name << " ids:" << varname - << " out: " << out_var_name; - paddle::platform::CPUPlace cpu_place; - auto *ins = distributed::LargeScaleKV::GetInstance(); - - if 
(ins->ParamInLargeScale(table_name)) { - auto lookup_table_op = PullLargeScaleOp(table_name, varname, out_var_name); - lookup_table_op->Run(*scope, cpu_place); - } else { - auto lookup_table_op = - BuildLookupTableOp(table_name, varname, out_var_name); - lookup_table_op->Run(*scope, cpu_place); - } - - return true; -} - -bool RequestCheckpointHandler::Handle(const std::string &varname, - framework::Scope *scope, - framework::Variable *invar, - framework::Variable **outvar, - const int trainer_id, - const std::string &out_var_name, - const std::string &table_name) { - VLOG(4) << "receive save var " << varname << " with path " << out_var_name - << " mode " << table_name; - - int mode = std::stoi(table_name); - - auto *ins = distributed::LargeScaleKV::GetInstance(); - ins->Get(varname)->Save(out_var_name, mode); - return true; -} - -bool RequestNotifyHandler::Handle(const std::string &varname, - framework::Scope *scope, - framework::Variable *invar, - framework::Variable **outvar, - const int trainer_id, - const std::string &out_var_name, - const std::string &table_name) { - VLOG(3) << "RequestNotifyHandler: " << varname - << ", trainer_id: " << trainer_id; - - string::Piece decay_piece(STEP_COUNTER); - string::Piece var_name_piece = string::Piece(varname); - if (string::Contains(var_name_piece, decay_piece)) { - VLOG(3) << "LearningRate Decay Counter Update"; - - auto *send_var = scope->FindVar(varname); - auto send_var_tensor = send_var->Get(); - auto *send_value = - send_var_tensor.mutable_data(send_var_tensor.place()); - - auto counter = decay_counters.at(trainer_id); - counter += send_value[0]; - decay_counters.at(trainer_id) = counter; - - auto *global_step_var = this->scope()->FindVar(LEARNING_RATE_DECAY_COUNTER); - if (global_step_var == nullptr) { - PADDLE_THROW(platform::errors::InvalidArgument( - "can not find LEARNING_RATE_DECAY_COUNTER ")); - } - - auto *tensor = global_step_var->GetMutable(); - auto *value = tensor->mutable_data(platform::CPUPlace()); - - auto global_counter = 0; - for (auto &trainer_counter : decay_counters) { - global_counter += trainer_counter.second; - } - value[0] = global_counter; - - if (lr_decay_prepared_ctx_.get() == nullptr) { - PADDLE_THROW(platform::errors::InvalidArgument( - "can not find decay block for executor")); - } - - executor_->RunPreparedContext(lr_decay_prepared_ctx_.get(), scope_); - } - return true; -} - -bool RequestSendAndRecvHandler::Handle(const std::string &varname, - framework::Scope *Scope, - framework::Variable *var, - framework::Variable **outvar, - const int trainer_id, - const std::string &out_var_name, - const std::string &table_name) { - VLOG(3) << "SendAndRecvHandle: " << varname - << " out_var_name: " << out_var_name - << " , trainer_id: " << trainer_id; - - executor_->RunPreparedContext((*grad_to_prepared_ctx_)[varname].get(), Scope); - *outvar = Scope->FindVar(out_var_name); - return true; -} - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/request_handler_impl.h b/paddle/fluid/operators/distributed/request_handler_impl.h deleted file mode 100644 index 6d239673f9104131c3129ea822e5c9f892845ea1..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/request_handler_impl.h +++ /dev/null @@ -1,198 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/executor.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/distributed/request_handler.h" - -namespace paddle { -namespace framework { -class Scope; -class Variable; -} // namespace framework -} // namespace paddle - -namespace paddle { -namespace operators { -namespace distributed { - -class RequestSendHandler final : public RequestHandler { - public: - explicit RequestSendHandler(int distributed_mode, bool enable_dc_asgd = false) - : RequestHandler(distributed_mode) { - enable_dc_asgd_ = enable_dc_asgd; - } - virtual ~RequestSendHandler() {} - bool Handle(const std::string& varname, framework::Scope* scope, - framework::Variable* var, framework::Variable** outvar, - const int trainer_id, const std::string& out_var_name = "", - const std::string& table_name = "") override; - - private: - bool enable_dc_asgd_; -}; - -class RequestGetHandler final : public RequestHandler { - public: - explicit RequestGetHandler(int distributed_mode, bool enable_dc_asgd = false) - : RequestHandler(distributed_mode) { - enable_dc_asgd_ = enable_dc_asgd; - } - virtual ~RequestGetHandler() {} - bool Handle(const std::string& varname, framework::Scope* scope, - framework::Variable* var, framework::Variable** outvar, - const int trainer_id, const std::string& out_var_name = "", - const std::string& table_name = "") override; - - private: - bool enable_dc_asgd_; -}; - -class RequestGetNoBarrierHandler final : public RequestHandler { - public: - RequestGetNoBarrierHandler() : RequestHandler(false) {} - virtual ~RequestGetNoBarrierHandler() {} - bool Handle(const std::string& varname, framework::Scope* scope, - framework::Variable* var, framework::Variable** outvar, - const int trainer_id, const std::string& out_var_name = "", - const std::string& table_name = "") override; -}; - -static inline void BuildVar(const std::string& param_name, - std::initializer_list arguments, - paddle::framework::proto::OpDesc::Var* var) { - var->set_parameter(param_name); - for (auto& arg_name : arguments) { - *var->mutable_arguments()->Add() = arg_name; - } -} - -class RequestPrefetchHandler final : public RequestHandler { - public: - explicit RequestPrefetchHandler(int distributed_mode) - : RequestHandler(distributed_mode) {} - virtual ~RequestPrefetchHandler() {} - bool Handle(const std::string& varname, framework::Scope* scope, - framework::Variable* var, framework::Variable** outvar, - const int trainer_id, const std::string& out_var_name = "", - const std::string& table_name = "") override; - - private: - std::unique_ptr PullLargeScaleOp( - const std::string& table_name, const std::string& id_name, - const std::string& out_name) { - framework::OpDesc desc; - 
desc.SetType("lookup_sparse_table_read"); - desc.SetInput("Ids", {id_name}); - desc.SetOutput("Out", std::vector({out_name})); - desc.SetAttr("tablename", {table_name}); - desc.SetAttr("init", true); - desc.SetAttr("value_names", std::vector({"Param"})); - - auto op = paddle::framework::OpRegistry::CreateOp(desc); - return op; - } - - std::unique_ptr BuildLookupTableOp( - const std::string& table_name, const std::string& id_name, - const std::string& out_name) { - paddle::framework::proto::OpDesc op_desc; - op_desc.set_type("lookup_table"); - BuildVar("W", {table_name.data()}, op_desc.add_inputs()); - BuildVar("Ids", {id_name.data()}, op_desc.add_inputs()); - BuildVar("Out", {out_name.data()}, op_desc.add_outputs()); - - auto op = paddle::framework::OpRegistry::CreateOp(op_desc); - return op; - } -}; - -class RequestCheckpointHandler final : public RequestHandler { - public: - explicit RequestCheckpointHandler(int distributed_mode) - : RequestHandler(distributed_mode) {} - - virtual ~RequestCheckpointHandler() {} - bool Handle(const std::string& varname, framework::Scope* scope, - framework::Variable* var, framework::Variable** outvar, - const int trainer_id, const std::string& out_var_name = "", - const std::string& table_name = "") override; - - private: - std::unique_ptr BuildCheckpointOp( - const std::string& varname, const std::string& file_path) { - paddle::framework::proto::OpDesc op_desc; - op_desc.set_type("save"); - BuildVar("X", {varname.data()}, op_desc.add_inputs()); - - auto attr = op_desc.mutable_attrs()->Add(); - attr->set_name("file_path"); - attr->set_type(paddle::framework::proto::AttrType::STRING); - attr->set_s(file_path); - - auto op = paddle::framework::OpRegistry::CreateOp(op_desc); - return op; - } -}; - -class RequestNotifyHandler final : public RequestHandler { - public: - explicit RequestNotifyHandler(int distributed_mode, int trainers) - : RequestHandler(distributed_mode) { - this->trainers = trainers; - for (int i = 0; i < trainers; i++) { - decay_counters[i] = 0; - } - } - virtual ~RequestNotifyHandler() {} - bool Handle(const std::string& varname, framework::Scope* scope, - framework::Variable* var, framework::Variable** outvar, - const int trainer_id, const std::string& out_var_name = "", - const std::string& table_name = "") override; - - private: - int trainers; - std::unordered_map decay_counters; -}; - -class RequestSendAndRecvHandler final : public RequestHandler { - public: - explicit RequestSendAndRecvHandler(int distributed_mode) - : RequestHandler(distributed_mode) {} - virtual ~RequestSendAndRecvHandler() {} - bool Handle(const std::string& varname, framework::Scope* Scope, - framework::Variable* var, framework::Variable** outvar, - const int trainer_id, const std::string& out_var_name = "", - const std::string& table_name = "") override; -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/rpc_client.cc b/paddle/fluid/operators/distributed/rpc_client.cc deleted file mode 100644 index 57ce54870decf2d56c321efbaddbc108fb113ea7..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/rpc_client.cc +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/distributed/rpc_client.h" -#include "gflags/gflags.h" - -// default to 3min to avoid temprary network failures. -DEFINE_int32(rpc_deadline, 180000, "deadline timeouts for rpc"); -DEFINE_int32(rpc_retry_times, 3, "retry times for rpc"); - -namespace paddle { -namespace operators { -namespace distributed { - -std::once_flag RPCClient::init_flag_; -std::unique_ptr RPCClient::rpc_client_(nullptr); -int RPCClient::trainer_id_ = 0; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/rpc_client.h b/paddle/fluid/operators/distributed/rpc_client.h deleted file mode 100644 index 2c756a6f71ff94e11be634f69d4a2f8e9174d716..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/rpc_client.h +++ /dev/null @@ -1,143 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
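The deleted rpc_client.cc wires its timeouts through gflags: DEFINE_int32 creates FLAGS_rpc_deadline and FLAGS_rpc_retry_times in one translation unit, and rpc_client.h re-exports them with DECLARE_int32 so other files can read them, for example as the default argument int64_t time_out = FLAGS_rpc_deadline. Below is a minimal sketch of that pattern collapsed into a single file; the flag names match the originals, but CallWithDeadline is purely illustrative and not a real API.

#include <cstdint>
#include <iostream>
#include <string>
#include "gflags/gflags.h"

// In the original code these definitions live in rpc_client.cc ...
DEFINE_int32(rpc_deadline, 180000, "deadline timeouts for rpc");
DEFINE_int32(rpc_retry_times, 3, "retry times for rpc");

// ... and rpc_client.h pulls them in with DECLARE_int32 so callers can use
// them, e.g. as default arguments. Illustrative helper, not a real API:
void CallWithDeadline(const std::string& endpoint,
                      int64_t time_out = FLAGS_rpc_deadline) {
  std::cout << "calling " << endpoint << " with deadline " << time_out
            << " ms, up to " << FLAGS_rpc_retry_times << " retries\n";
}

int main(int argc, char* argv[]) {
  // e.g. ./a.out --rpc_deadline=60000 --rpc_retry_times=5
  gflags::ParseCommandLineFlags(&argc, &argv, true);
  CallWithDeadline("127.0.0.1:6000");
  return 0;
}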
- -#pragma once - -#include // NOLINT -#include -#include - -#include "gflags/gflags.h" -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/operators/distributed/request_handler.h" - -namespace paddle { -namespace framework { -class Scope; -} // namespace framework -namespace platform { -class DeviceContext; -} // namespace platform -} // namespace paddle - -DECLARE_int32(rpc_deadline); -DECLARE_int32(rpc_retry_times); - -namespace paddle { -namespace operators { -namespace distributed { - -class RPCClient { - public: - RPCClient() {} - virtual ~RPCClient() {} - virtual VarHandlePtr AsyncSendVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual VarHandlePtr AsyncGetVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - const std::string& out_varname, - const std::string& table_name = "", - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual VarHandlePtr AsyncGetVarNoBarrier( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - const std::string& out_varname, - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual VarHandlePtr AsyncGetMonomerVariable( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual VarHandlePtr AsyncPrefetchVar( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& in_var_name, - const std::string& out_var_name, const std::string& table_name = "", - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual VarHandlePtr AsyncSendBatchBarrier( - const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual VarHandlePtr AsyncSendFetchBarrier( - const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual VarHandlePtr AsyncGetMonomerBarrier( - const std::string& ep, const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual VarHandlePtr AsyncCheckpointNotify( - const std::string& ep, const std::string& dirname, - const std::string& varname, const int mode, - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual VarHandlePtr AsyncDistributeNotify( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual VarHandlePtr AsyncSendAndRecv( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& send_var_name, - const std::string& recv_var_name, const std::string& table_name = "", - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual VarHandlePtr AsyncSendComplete( - const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) = 0; - - // Complete tells all the pserver instances that finishe the training, - // the pserver can reduce it's barrier count, and continue to train - // with other trainers. - virtual void SendComplete() = 0; - - virtual bool Wait() = 0; - - template - static RPCClient* GetInstance(int trainer_id) { - std::call_once(init_flag_, &RPCClient::Init, trainer_id); - return rpc_client_.get(); - } - - // Init is called by GetInstance. 
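GetInstance above relies on std::call_once so that exactly one process-wide client is built and the trainer id is recorded before any RPC is issued, via the Init helper that follows. The same idiom, reduced to a self-contained sketch: RpcClientBase and DummyClient are illustrative stand-ins for RPCClient and its concrete gRPC/BRPC implementations, not the Paddle classes.

#include <iostream>
#include <memory>
#include <mutex>

class RpcClientBase {
 public:
  virtual ~RpcClientBase() = default;
  virtual void InitImpl() {}

  template <typename T>
  static RpcClientBase* GetInstance(int trainer_id) {
    // Init runs exactly once, even if several threads race to get the client.
    std::call_once(init_flag_, &RpcClientBase::Init<T>, trainer_id);
    return client_.get();
  }

 protected:
  template <typename T>
  static void Init(int trainer_id) {
    trainer_id_ = trainer_id;
    client_.reset(new T());
    client_->InitImpl();
  }

  static int trainer_id_;

 private:
  static std::once_flag init_flag_;
  static std::unique_ptr<RpcClientBase> client_;
};

int RpcClientBase::trainer_id_ = 0;
std::once_flag RpcClientBase::init_flag_;
std::unique_ptr<RpcClientBase> RpcClientBase::client_;

class DummyClient : public RpcClientBase {
 public:
  void InitImpl() override { std::cout << "DummyClient initialized\n"; }
};

int main() {
  auto* a = RpcClientBase::GetInstance<DummyClient>(0);
  auto* b = RpcClientBase::GetInstance<DummyClient>(7);  // does not re-init
  std::cout << (a == b ? "same instance\n" : "different instances\n");
  return 0;
}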
- template - static void Init(int trainer_id) { - VLOG(1) << "init rpc client with trainer_id " << trainer_id; - trainer_id_ = trainer_id; - if (rpc_client_.get() == nullptr) { - rpc_client_.reset(new T()); - rpc_client_->InitImpl(); - } - } - - virtual void InitImpl() {} - - protected: - // each trainer have exact one trainer id, it should be static - static int trainer_id_; - - private: - static std::once_flag init_flag_; - static std::unique_ptr rpc_client_; -}; -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/rpc_server.cc b/paddle/fluid/operators/distributed/rpc_server.cc deleted file mode 100644 index 37cf0460fb1fa11cb2a7de428463c20c145cb1b5..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/rpc_server.cc +++ /dev/null @@ -1,242 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/distributed/rpc_server.h" - -#include -#include - -namespace paddle { -namespace framework { -class Scope; -} // namespace framework -namespace platform { -class DeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -namespace distributed { - -class RequestHandler; - -void RPCServer::ShutDown() { - VLOG(3) << "RPCServer ShutDown "; - ShutDownImpl(); - - exit_flag_ = true; - barrier_cond_.notify_all(); - rpc_cond_.notify_all(); -} - -void RPCServer::SavePort() const { - auto file_path = string::Sprintf("/tmp/paddle.%d.port", ::getpid()); - std::ofstream port_file; - port_file.open(file_path); - port_file << selected_port_; - port_file.close(); - VLOG(3) << "selected port written to " << file_path; -} - -void RPCServer::WaitBarrier(const std::string& rpc_name) { - VLOG(3) << "WaitBarrier in: " << rpc_name; - std::unique_lock lock(this->mutex_); - barrier_cond_.wait(lock, [this, &rpc_name] { - return ((barrier_counter_[rpc_name] == client_num_ && client_num_ != 0) || - exit_flag_.load()); - }); - - VLOG(3) << "WaitBarrier out: " << rpc_name - << " counter: " << barrier_counter_[rpc_name]; -} - -void RPCServer::IncreaseBatchBarrier(const std::string rpc_name) { - VLOG(3) << "RPCServer begin IncreaseBatchBarrier " << rpc_name; - // barrier msg should make sure that it's in the right cond(send|recv) - WaitCond(rpc_name); - int b = 0; - std::unique_lock lock(mutex_); - b = ++barrier_counter_[rpc_name]; - VLOG(3) << rpc_name << " barrier_counter: " << b; - if (b >= client_num_) { - lock.unlock(); - VLOG(3) << "BatchBarrier counter reach " << client_num_ << " for " - << rpc_name; - barrier_cond_.notify_all(); - lock.lock(); - } -} - -void RPCServer::Complete() { - { - std::unique_lock lock(mutex_); - client_num_--; - need_reset_all_vars_ = true; - - VLOG(3) << "decrease client_num to: " << client_num_; - if (cur_cond_.load() == rpc_cond_map_[kRequestGet]) { - barrier_counter_[kRequestGet]--; - } - } - barrier_cond_.notify_all(); -} - -bool 
RPCServer::NeedResetAllVars() { - std::unique_lock lock(mutex_); - return need_reset_all_vars_; -} - -int RPCServer::GetClientNum() { - std::unique_lock lock(mutex_); - return client_num_; -} - -void RPCServer::ResetBarrierCounter() { - VLOG(3) << "RPCServer ResetBarrierCounter "; - std::unique_lock lock(mutex_); - for (auto& t : barrier_counter_) { - t.second = 0; - } - need_reset_all_vars_ = false; -} - -void RPCServer::RegisterRPC(const std::string& rpc_name, - RequestHandler* handler, int thread_num) { - rpc_call_map_[rpc_name] = handler; - rpc_thread_num_[rpc_name] = thread_num; - - static int cond = -1; - rpc_cond_map_[rpc_name] = ++cond; - VLOG(3) << "RegisterRPC rpc_name: " << rpc_name << ", handler: " << handler - << ", cond: " << rpc_cond_map_[rpc_name]; -} - -void RPCServer::SetCond(const std::string& rpc_name) { - VLOG(3) << "RPCServer SetCond " << rpc_name; - { - std::unique_lock lock(mutex_); - cur_cond_ = rpc_cond_map_[rpc_name]; - } - - rpc_cond_.notify_all(); -} - -void RPCServer::WaitCond(const std::string& rpc_name) { - VLOG(3) << "RPCServer WaitCond in " << rpc_name; - int cond = 0; - { - std::unique_lock lock(mutex_); - cond = rpc_cond_map_[rpc_name]; - } - - std::unique_lock lock(mutex_); - rpc_cond_.wait( - lock, [=] { return (cur_cond_.load() == cond || exit_flag_.load()); }); - VLOG(3) << "RPCServer WaitCond out " << rpc_name; -} - -void RPCServer::RegisterVar(const std::string& var_name, - const std::string& rpc_name, - framework::Scope* scope, - platform::DeviceContext* dev_ctx) { - MonomerHandle h; - h.var_name_ = var_name; - h.rpc_name_ = rpc_name; - h.scope_ = scope; - h.dev_ctx_ = dev_ctx; - - { - std::unique_lock lock(mutex_); - PADDLE_ENFORCE_EQ( - var_map_.find(var_name), var_map_.end(), - platform::errors::AlreadyExists("%s already in var_map.", var_name)); - var_map_[var_name] = h; - } - - rpc_cond_.notify_all(); - VLOG(3) << "RegisterVar context:" << h.String(); -} - -void RPCServer::IncreaseVarBarrier(const std::string& var_name) { - int b = 0; - MonomerHandle h; - { - std::unique_lock lock(mutex_); - b = ++var_map_[var_name].barrier_; - h = var_map_[var_name]; - } - - if (b >= client_num_) { - barrier_cond_.notify_all(); - } - - VLOG(3) << "IncreaseVarBarrier context:" << h.String(); -} - -void RPCServer::WaitVarBarrier(const std::string& var_name) { - VLOG(3) << "WaitVarBarrier var_name:" << var_name; - - std::unique_lock lock(mutex_); - barrier_cond_.wait(lock, [&]() { - return ((var_map_[var_name].barrier_ >= client_num_ && client_num_ != 0) || - exit_flag_.load()); - }); - - VLOG(3) << "WaitVarBarrier context: " << var_map_[var_name].String(); -} - -void RPCServer::SetVarCond(const std::string& var_name) { - VLOG(3) << "SetVarCond var_name:" << var_name; - { - std::unique_lock lock(mutex_); - if (var_map_.find(var_name) != var_map_.end()) { - rpc_cond_.notify_all(); - } - } -} - -void RPCServer::WaitVarCond(const std::string& var_name) { - VLOG(3) << "WaitVarCond var_name:" << var_name; - - std::unique_lock lock(mutex_); - rpc_cond_.wait(lock, [=] { - return (var_map_.find(var_name) != var_map_.end() || exit_flag_.load()); - }); - - VLOG(3) << "WaitVarCond var_name:" << var_name << " end"; -} - -MonomerHandle RPCServer::GetMonomer(const std::string& var_name) { - MonomerHandle h; - { - std::unique_lock lock(mutex_); - h = var_map_[var_name]; - } - - return h; -} - -void RPCServer::ClearRegisteredVars() { - std::unique_lock lock(mutex_); - var_map_.clear(); -} - -void RPCServer::ClearVar(const std::string& var_name) { - std::unique_lock 
lock(mutex_); - var_map_.erase(var_name); -} -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/rpc_server.h b/paddle/fluid/operators/distributed/rpc_server.h deleted file mode 100644 index 2120260515e2556046358e62592214c9f648533b..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/rpc_server.h +++ /dev/null @@ -1,149 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include // NOLINT -#include -#include -#include - -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/operators/distributed/request_handler.h" -#include "paddle/fluid/platform/device_context.h" - -namespace paddle { -namespace framework { -class Scope; -} // namespace framework -namespace platform { -class DeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -namespace distributed { - -class RequestHandler; - -struct MonomerHandle { - std::string var_name_; - std::string rpc_name_; - framework::Scope* scope_{nullptr}; - platform::DeviceContext* dev_ctx_{nullptr}; - int64_t barrier_{0}; - - std::string String() { - std::stringstream ss; - ss << "var_name:" << var_name_ << ", rpc_name:" << rpc_name_ - << ", scope:" << scope_ << ", dev_ctx:" << dev_ctx_ - << ", barrier_:" << barrier_; - return ss.str(); - } -}; - -class RPCServer { - public: - explicit RPCServer(const std::string& address, int client_num) - : cur_cond_(0), - bind_address_(address), - exit_flag_(false), - selected_port_(0), - client_num_(client_num), - need_reset_all_vars_(false) {} - - virtual ~RPCServer() {} - virtual void StartServer() = 0; - virtual void WaitServerReady() = 0; - - void ShutDown(); - - bool IsExit() { return exit_flag_.load(); } - - int GetSelectedPort() const { return selected_port_; } - - int GetClientNum(); - - void SavePort() const; - - // RegisterRPC, register the rpc method name to a handler - // class, and auto generate a condition id for this call - // to be used for the barrier. - void RegisterRPC(const std::string& rpc_name, RequestHandler* handler, - int thread_num = 1); - - int GetThreadNum(const std::string& rpc_name) { - return rpc_thread_num_[rpc_name]; - } - - // Wait util all the clients have reached the barrier for one - // rpc method. This function should be called in the - // RequestHandler if you want to run the server/client in a - // synchronous mode. 
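The barrier machinery these declarations describe is a counter guarded by a mutex plus a condition variable: each trainer bumps the counter for an rpc name, and waiters are released once the counter reaches client_num_ or the server is shutting down. A self-contained sketch of that mechanism with plain std threads follows; BarrierSketch is a reduced stand-in for the removed RPCServer logic, with no RPC plumbing.

#include <condition_variable>
#include <iostream>
#include <mutex>
#include <string>
#include <thread>
#include <unordered_map>
#include <vector>

// Reduced model of the RPCServer barrier: one counter per rpc name,
// Wait blocks until every client has checked in (or shutdown is requested).
class BarrierSketch {
 public:
  explicit BarrierSketch(int client_num) : client_num_(client_num) {}

  void Increase(const std::string& rpc_name) {
    std::unique_lock<std::mutex> lock(mutex_);
    int b = ++counter_[rpc_name];
    if (b >= client_num_) {
      cond_.notify_all();
    }
  }

  void Wait(const std::string& rpc_name) {
    std::unique_lock<std::mutex> lock(mutex_);
    cond_.wait(lock, [&] {
      return counter_[rpc_name] >= client_num_ || exit_;
    });
  }

  void ShutDown() {
    std::lock_guard<std::mutex> lock(mutex_);
    exit_ = true;
    cond_.notify_all();
  }

 private:
  std::mutex mutex_;
  std::condition_variable cond_;
  std::unordered_map<std::string, int> counter_;
  const int client_num_;
  bool exit_ = false;
};

int main() {
  BarrierSketch barrier(3);
  std::vector<std::thread> trainers;
  for (int i = 0; i < 3; ++i) {
    trainers.emplace_back([&barrier, i] {
      std::cout << "trainer " << i << " sent its gradients\n";
      barrier.Increase("RequestSend");
    });
  }
  barrier.Wait("RequestSend");  // server side: all trainers have checked in
  std::cout << "batch barrier reached\n";
  for (auto& t : trainers) t.join();
  return 0;
}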
- void WaitBarrier(const std::string& rpc_name); - - void SetCond(const std::string& rpc_name); - void WaitCond(const std::string& rpc_name); - void IncreaseBatchBarrier(const std::string rpc_name); - - void RegisterVar(const std::string& var_name, const std::string& rpc_name, - framework::Scope* scope, platform::DeviceContext* dev_ctx); - void IncreaseVarBarrier(const std::string& var_name); - void WaitVarBarrier(const std::string& var_name); - void SetVarCond(const std::string& var_name); - void WaitVarCond(const std::string& var_name); - void ClearRegisteredVars(); - void ClearVar(const std::string& var_name); - MonomerHandle GetMonomer(const std::string& var_name); - - void Complete(); - - void ResetBarrierCounter(); - - bool NeedResetAllVars(); - - protected: - virtual void ShutDownImpl() = 0; - - private: - std::mutex mutex_; - std::unordered_map barrier_counter_; - std::condition_variable barrier_cond_; - - std::unordered_map rpc_cond_map_; - std::atomic cur_cond_; - std::condition_variable rpc_cond_; - - protected: - std::string bind_address_; - std::atomic exit_flag_; - int selected_port_; - int client_num_; - bool need_reset_all_vars_; - - std::unordered_map rpc_call_map_; - std::unordered_map rpc_thread_num_; - friend class RequestHandler; - - // TODO(gongwb): use more cond to notify or wait; - std::unordered_map var_map_; -}; - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/rpc_server_test.cc b/paddle/fluid/operators/distributed/rpc_server_test.cc deleted file mode 100644 index f59285400033df2726fe519fe76dd63bd8c504d5..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/rpc_server_test.cc +++ /dev/null @@ -1,344 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include // NOLINT -#include -#include -#include // NOLINT -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/block_desc.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" - -#include "paddle/fluid/operators/distributed/distributed.h" -#include "paddle/fluid/operators/distributed/heart_beat_monitor.h" -#include "paddle/fluid/operators/distributed/large_scale_kv.h" -#include "paddle/fluid/operators/distributed/request_handler_impl.h" -#include "paddle/fluid/operators/distributed/rpc_client.h" -#include "paddle/fluid/operators/distributed/rpc_server.h" - -namespace framework = paddle::framework; -namespace platform = paddle::platform; -namespace distributed = paddle::operators::distributed; - -USE_NO_KERNEL_OP(lookup_sparse_table_read); -USE_NO_KERNEL_OP(checkpoint_notify); -USE_OP(scale); - -std::unique_ptr g_rpc_service; -std::unique_ptr g_req_handler; - -framework::BlockDesc* AppendSendAndRecvBlock(framework::ProgramDesc* program) { - auto root_block = program->MutableBlock(0); - auto* block = program->AppendBlock(*root_block); - - framework::OpDesc* op = block->AppendOp(); - op->SetType("scale"); - op->SetInput("X", {"x"}); - op->SetOutput("Out", {"res"}); - op->SetAttr("scale", 0.5f); - - auto& out = *root_block->Var("res"); - out.SetType(framework::proto::VarType::LOD_TENSOR); - out.SetShape({1, 10}); - - return block; -} - -void CreateVarsOnScope(framework::Scope* scope, platform::CPUPlace* place) { - auto w_var = scope->Var("w"); - w_var->GetMutable(); - - auto out_var = scope->Var("out"); - out_var->GetMutable(); - - auto ids_var = scope->Var("ids"); - ids_var->GetMutable(); - - auto x_var = scope->Var("x"); - x_var->GetMutable(); - - auto res_var = scope->Var("res"); - res_var->GetMutable(); -} - -void InitTensorsOnClient(framework::Scope* scope, platform::CPUPlace* place, - int64_t rows_numel) { - CreateVarsOnScope(scope, place); - auto ids_var = scope->Var("ids")->GetMutable(); - int64_t* ids_ptr = - ids_var->mutable_data(framework::DDim({rows_numel, 1}), *place); - for (int64_t i = 0; i < rows_numel; ++i) ids_ptr[i] = i * 2; - - auto x_var = scope->Var("x")->GetMutable(); - float* x_ptr = - x_var->mutable_data(framework::DDim({1, rows_numel}), *place); - for (int64_t i = 0; i < rows_numel; ++i) x_ptr[i] = 1.0; -} - -void InitTensorsOnServer(framework::Scope* scope, platform::CPUPlace* place, - int64_t rows_numel) { - CreateVarsOnScope(scope, place); - auto w = scope->Var("w")->GetMutable(); - auto w_value = w->mutable_value(); - w_value->Resize({rows_numel, 10}); - for (int64_t i = 0; i < rows_numel; ++i) w->AutoGrownIndex(i, true); - - auto ptr = w_value->mutable_data(*place); - - for (int64_t i = 0; i < w_value->numel(); ++i) { - ptr[i] = static_cast(i / 10); - } -} - -void StartServer(const std::string& rpc_name) { - framework::ProgramDesc program; - framework::Scope scope; - platform::CPUPlace place; - framework::Executor exe(place); - platform::CPUDeviceContext ctx(place); - - std::unordered_map> - prefetch_var_name_to_prepared; - - g_req_handler->SetProgram(&program); - g_req_handler->SetPrefetchPreparedCtx(&prefetch_var_name_to_prepared); - g_req_handler->SetDevCtx(&ctx); - g_req_handler->SetScope(&scope); - g_req_handler->SetExecutor(&exe); - - g_rpc_service->RegisterRPC(rpc_name, g_req_handler.get()); - - // distributed::HeartBeatMonitor::Init(1, true, "w@grad"); - - g_req_handler->SetRPCServer(g_rpc_service.get()); - - std::thread server_thread( - 
std::bind(&distributed::RPCServer::StartServer, g_rpc_service.get())); - - server_thread.join(); -} - -void StartSendAndRecvServer(const std::string& rpc_name) { - framework::ProgramDesc program; - framework::Scope scope; - platform::CPUPlace place; - framework::Executor exe(place); - platform::CPUDeviceContext ctx(place); - auto block = AppendSendAndRecvBlock(&program); - std::string in_var_name("x"); - std::vector prefetch_block_ids{block->ID()}; - auto prepared = exe.Prepare(program, prefetch_block_ids); - InitTensorsOnServer(&scope, &place, 10); - - std::unordered_map> - grad_to_prepared_ctx; - grad_to_prepared_ctx[in_var_name] = prepared[0]; - - g_req_handler->SetProgram(&program); - g_req_handler->SetGradToPreparedCtx(&grad_to_prepared_ctx); - g_req_handler->SetDevCtx(&ctx); - g_req_handler->SetScope(&scope); - g_req_handler->SetExecutor(&exe); - - g_rpc_service->RegisterRPC(rpc_name, g_req_handler.get()); - g_req_handler->SetRPCServer(g_rpc_service.get()); - - std::thread server_thread( - std::bind(&distributed::RPCServer::StartServer, g_rpc_service.get())); - - server_thread.join(); -} - -TEST(COMPLETE, CPU) { - setenv("http_proxy", "", 1); - setenv("https_proxy", "", 1); - g_req_handler.reset( - new distributed::RequestSendHandler(distributed::DistributedMode::kSync)); - g_rpc_service.reset(new RPCSERVER_T("127.0.0.1:0", 2)); - distributed::RPCClient* client = - distributed::RPCClient::GetInstance(0); - PADDLE_ENFORCE_NE(client, nullptr, - platform::errors::InvalidArgument( - "Client Start Fail, Check Your Code & Env")); - std::thread server_thread(StartServer, distributed::kRequestSend); - g_rpc_service->WaitServerReady(); - int port = g_rpc_service->GetSelectedPort(); - std::string ep = paddle::string::Sprintf("127.0.0.1:%d", port); - client->AsyncSendComplete(ep); - client->Wait(); - - EXPECT_EQ(g_rpc_service->GetClientNum(), 1); - - g_rpc_service->ShutDown(); - server_thread.join(); - g_rpc_service.reset(nullptr); - g_req_handler.reset(nullptr); -} - -TEST(SENDANDRECV, CPU) { - setenv("http_proxy", "", 1); - setenv("https_proxy", "", 1); - g_req_handler.reset(new distributed::RequestSendAndRecvHandler( - distributed::DistributedMode::kAsync)); - g_rpc_service.reset(new RPCSERVER_T("127.0.0.1:0", 1)); - distributed::RPCClient* client = - distributed::RPCClient::GetInstance(0); - PADDLE_ENFORCE_NE(client, nullptr, - platform::errors::InvalidArgument( - "Client Start Fail, Check Your Code & Env")); - std::thread server_thread(StartSendAndRecvServer, - distributed::kRequestSendAndRecv); - g_rpc_service->WaitServerReady(); - int port = g_rpc_service->GetSelectedPort(); - std::string ep = paddle::string::Sprintf("127.0.0.1:%d", port); - - framework::Scope scope; - platform::CPUPlace place; - platform::CPUDeviceContext ctx(place); - - // create var on local scope - int64_t rows_numel = 10; - InitTensorsOnClient(&scope, &place, rows_numel); - std::string in_var_name("x"); - std::string out_var_name("res"); - - client->AsyncSendAndRecv(ep, ctx, scope, in_var_name, out_var_name); - client->Wait(); - auto var = scope.Var(out_var_name); - auto value = var->GetMutable(); - auto ptr = value->mutable_data(place); - - for (int64_t i = 0; i < rows_numel; ++i) { - EXPECT_EQ(ptr[i], 0.5); - } - g_rpc_service->ShutDown(); - server_thread.join(); - LOG(INFO) << "begin reset"; - g_rpc_service.reset(nullptr); - g_req_handler.reset(nullptr); -} - -void StartCheckpointServer(const std::string& rpc_name) { - framework::ProgramDesc program; - framework::Scope scope; - platform::CPUPlace place; - 
framework::Executor exe(place); - platform::CPUDeviceContext ctx(place); - - std::vector metas; - - auto meta = distributed::SparseMeta(); - meta.name = "embedding.block0"; - meta.value_names = {"Param"}; - meta.value_dims = {64}; - meta.mode = distributed::Mode::training; - meta.grad_name = "embedding@Grad"; - meta.cached_varnames = {"kSparseIds"}; - meta.initializer_attrs = {"fill_constant&1.0"}; - meta.entry = "none"; - - metas.push_back(meta); - distributed::LargeScaleKV::Init(metas); - - auto* ins = distributed::LargeScaleKV::GetInstance(); - ins->Get("embedding.block0")->Init({0, 1, 2, 3, 4, 5, 6, 7, 8, 9}); - - std::unordered_map> - prefetch_var_name_to_prepared; - - g_req_handler->SetProgram(&program); - g_req_handler->SetPrefetchPreparedCtx(&prefetch_var_name_to_prepared); - g_req_handler->SetDevCtx(&ctx); - g_req_handler->SetScope(&scope); - g_req_handler->SetExecutor(&exe); - - g_rpc_service->RegisterRPC(rpc_name, g_req_handler.get()); - - g_req_handler->SetRPCServer(g_rpc_service.get()); - - std::thread server_thread( - std::bind(&distributed::RPCServer::StartServer, g_rpc_service.get())); - - server_thread.join(); -} - -TEST(LARGE_SCALE_CHECKPOINT, CPU) { - setenv("http_proxy", "", 1); - setenv("https_proxy", "", 1); - - paddle::framework::Scope scope; - paddle::platform::CPUPlace place; - - g_req_handler.reset(new distributed::RequestCheckpointHandler( - distributed::DistributedMode::kAsync)); - g_rpc_service.reset(new RPCSERVER_T("127.0.0.1:0", 1)); - - distributed::RPCClient* client = - distributed::RPCClient::GetInstance(0); - - PADDLE_ENFORCE_NE(client, nullptr, - platform::errors::InvalidArgument( - "Client Start Fail, Check Your Code & Env")); - - std::thread server_thread(StartCheckpointServer, - distributed::kRequestCheckpoint); - g_rpc_service->WaitServerReady(); - - int port = g_rpc_service->GetSelectedPort(); - std::string ep = paddle::string::Sprintf("127.0.0.1:%d", port); - - auto save_path = - paddle::string::Sprintf("%s/%s/%s", "/tmp/large_scale_table/base", - "embedding", "embedding.block0"); - int mode = 0; - client->AsyncCheckpointNotify(ep, save_path, "embedding.block0", mode); - client->Wait(); - - save_path = - paddle::string::Sprintf("%s/%s/%s", "/tmp/large_scale_table/delta", - "embedding", "embedding.block0"); - mode = 1; - client->AsyncCheckpointNotify(ep, save_path, "embedding.block0", mode); - client->Wait(); - - paddle::framework::AttributeMap attrs; - - std::vector eps = {ep}; - attrs["endpoints"] = eps; - attrs["dirname"] = std::string("/tmp/large_scale_table/delta1"); - attrs["varname"] = std::string("embedding"); - attrs["mode"] = 2; - std::vector slices = {"embedding.block0"}; - attrs["slice_varnames"] = slices; - std::vector remotes = {"embedding.block0"}; - attrs["remote_varnames"] = remotes; - - auto ops = - framework::OpRegistry::CreateOp("checkpoint_notify", {}, {}, attrs, true); - ops->Run(scope, place); - - g_rpc_service->ShutDown(); - server_thread.join(); - LOG(INFO) << "begin reset"; - g_rpc_service.reset(nullptr); - g_req_handler.reset(nullptr); -} diff --git a/paddle/fluid/operators/distributed/send_recv.proto.in b/paddle/fluid/operators/distributed/send_recv.proto.in deleted file mode 100644 index a333642bd16fbfbe648a835101d67218bf473cdb..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/send_recv.proto.in +++ /dev/null @@ -1,88 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
Licensed under -the Apache License, Version 2.0 (the "License"); you may not use this file -except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -syntax = "proto3"; -package sendrecv; - -option cc_generic_services = @cc_generic_services@; - -service SendRecvService { - // For parameter server round-robin like hashing, do not split tensors. - // Send and recv only one tensor - // TODO(typhoonzero): add streaming API - rpc SendVariable(VariableMessage) returns (VoidMessage) {} - // Argument VariableMessage for GetVariable should only contain varname. - rpc GetVariable(VariableMessage) returns (VariableMessage) {} - rpc GetVariableNoBarrier(VariableMessage) returns (VariableMessage) {} - // pre-fetch variable by given variable name and Ids - rpc PrefetchVariable(VariableMessage) returns (VariableMessage) {} - - rpc CheckpointNotify(VariableMessage) returns (VoidMessage) {} - rpc DistributeNotify(VariableMessage) returns (VoidMessage) {} - rpc SendAndRecvVariable(VariableMessage) returns (VariableMessage) {} - rpc GetMonomerVariable(VariableMessage) returns (VariableMessage) {} - rpc GetMonomerBarrier(VariableMessage) returns (VoidMessage) {} -} - -// It can be: LoDTensor、SelectedRows or NCCL_ID -enum VarType { - LOD_TENSOR = 0; - SELECTED_ROWS = 1; - NCCL_ID = 2; -} - -// VariableMessage is serialized paddle variable message. -// NOTICE(gongwb):don't modify this proto if you are not -// not familar with how we serialize in sendrecvop_utils.h -// and deserilize it in variable_response.h. -message VariableMessage { - enum Type { - // Pod Types - BOOL = 0; - INT16 = 1; - INT32 = 2; - INT64 = 3; - FP16 = 4; - FP32 = 5; - FP64 = 6; - } - - message LodData { repeated int64 lod_data = 1; } - string varname = 1; - // TODO(Yancey1989): reference framework::proto::VarDesc::VarType - VarType type = 2; - // bool persistable is not needed for sending. - // tensor info: - Type data_type = 3; - repeated int64 dims = 4; - - // lod details: - int64 lod_level = 5; - repeated LodData lod = 6; - // selected_rows height, aka. original dim0 - int64 slr_height = 7; - // tensor data - bytes serialized = 8; - // selected_rows data - bytes rows = 9; - // Look up table block execution output variable name. - string out_varname = 10; - // If 1, the ps server will start profiling, the ps - // server stops profiling and generates a profile to /tmp/profile_ps_* - // when profile switches from 1 to 2. - int64 profile = 11; - int64 trainer_id = 12; - string table_name = 13; -} - -message VoidMessage {} diff --git a/paddle/fluid/operators/distributed/sendrecvop_utils.cc b/paddle/fluid/operators/distributed/sendrecvop_utils.cc deleted file mode 100644 index 107c74eb2670e4c83184167e7758119e95e72cf9..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/sendrecvop_utils.cc +++ /dev/null @@ -1,117 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#include - -#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" - -namespace paddle { -namespace framework { -class Variable; -} // namespace framework -namespace memory { -namespace allocation { -class Allocation; -} // namespace allocation -} // namespace memory -} // namespace paddle - -DEFINE_bool(rpc_disable_reuse_port, false, "Disable SO_REUSEPORT or not."); -DEFINE_int32(rpc_retry_bind_port, 3, - "Retry to bind the address if address is already used."); - -namespace paddle { -namespace operators { -namespace distributed { - -using VarMsg = sendrecv::VariableMessage; - -static TensorPayload GetCommunicationAllocationFromTensor( - const platform::DeviceContext& ctx, const framework::Tensor& tensor) { - if (is_gpu_place(ctx.GetPlace())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - PADDLE_ENFORCE_EQ( - is_gpu_place(tensor.place()), true, - platform::errors::PreconditionNotMet("Please run in gpu place.")); - auto& gpu_dev_ctx = - reinterpret_cast(ctx); - auto copy_size = tensor.numel() * framework::SizeOfType(tensor.type()); - platform::CUDAPinnedPlace cuda_pinned; - auto result = memory::AllocShared(cuda_pinned, copy_size); - - memory::Copy(cuda_pinned, result->ptr(), - BOOST_GET_CONST(platform::CUDAPlace, tensor.place()), - tensor.data(), copy_size, gpu_dev_ctx.stream()); - ctx.Wait(); - return TensorPayload(result); -#else - PADDLE_THROW( - platform::errors::Unavailable("This situation should not be happened")); -#endif - } else { - return TensorPayload(tensor); - } -} -TensorPayload GetTensorPayload(framework::Variable* var, - const platform::DeviceContext& ctx, - VarMsg* request) { - auto tensor = var->Get(); - // FIXME(wuyi): data types in send_recv.proto is copied from - // framework.proto - request->set_data_type(static_cast(tensor.type())); - for (auto& dim : framework::vectorize(tensor.dims())) { - request->add_dims(dim); - } - const framework::LoD lod = tensor.lod(); - if (lod.size() > 0) { - request->set_lod_level(lod.size()); - for (auto& each : lod) { - VarMsg::LodData* lod_inner = request->add_lod(); - for (auto& d : each) { - lod_inner->add_lod_data(d); - } - } - } - return GetCommunicationAllocationFromTensor(ctx, tensor); -} - -TensorPayload GetSelectedRowsPayload(framework::Variable* var, - const platform::DeviceContext& ctx, - VarMsg* request) { - auto* slr = var->GetMutable(); - request->set_data_type(static_cast(slr->value().type())); - request->set_lod_level(0); - request->set_slr_height(slr->height()); - - for (auto& dim : framework::vectorize(slr->value().dims())) { - request->add_dims(dim); - } - - auto* tensor = slr->mutable_value(); - return GetCommunicationAllocationFromTensor(ctx, *tensor); -} - -TensorPayload::TensorPayload(std::shared_ptr allocation) - : allocation_(allocation), offset_(0), memory_size_(allocation->size()) {} -TensorPayload::TensorPayload(const framework::Tensor& tensor) - : allocation_(tensor.Holder()), - offset_(tensor.offset()), - memory_size_(tensor.numel() * framework::SizeOfType(tensor.type())) {} -void* TensorPayload::ptr() const { - return reinterpret_cast( - reinterpret_cast(allocation_->ptr()) + 
offset_); -} -size_t TensorPayload::memory_size() const { return memory_size_; } -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/sendrecvop_utils.h b/paddle/fluid/operators/distributed/sendrecvop_utils.h deleted file mode 100644 index 84ed1ab0247124bfd494810400fea4d108acf164..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/sendrecvop_utils.h +++ /dev/null @@ -1,110 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/distributed/distributed_pb.h" -#include "paddle/fluid/platform/port.h" - -namespace paddle { -namespace framework { -class Tensor; -class Variable; -} // namespace framework -namespace memory { -namespace allocation { -class Allocation; -} // namespace allocation -} // namespace memory -namespace platform { -class DeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -namespace distributed { - -using VarMsg = sendrecv::VariableMessage; - -class TensorPayload final { - public: - explicit TensorPayload(const framework::Tensor& tensor); - explicit TensorPayload(std::shared_ptr allocation); - - TensorPayload(const TensorPayload& o) = default; - TensorPayload& operator=(const TensorPayload& o) = default; - - void* ptr() const; - size_t memory_size() const; - - private: - std::shared_ptr allocation_; - size_t offset_; - size_t memory_size_; -}; - -inline void SerializeDestroyCallback(void* payload) { - if (payload != nullptr) { - auto* shared_payload = reinterpret_cast(payload); - delete shared_payload; - } -} - -TensorPayload GetTensorPayload(framework::Variable* var, - const platform::DeviceContext& ctx, - VarMsg* request); - -TensorPayload GetSelectedRowsPayload(framework::Variable* var, - const platform::DeviceContext& ctx, - VarMsg* request); - -inline framework::proto::VarType::Type ToVarType( - sendrecv::VariableMessage::Type type) { - switch (type) { - case sendrecv::VariableMessage::FP32: - return framework::proto::VarType::FP32; // NOLINT - case sendrecv::VariableMessage::FP64: - return framework::proto::VarType::FP64; // NOLINT - case sendrecv::VariableMessage::INT32: - return framework::proto::VarType::INT32; // NOLINT - case sendrecv::VariableMessage::INT64: - return framework::proto::VarType::INT64; // NOLINT - case sendrecv::VariableMessage::BOOL: - return framework::proto::VarType::BOOL; // NOLINT - default: - PADDLE_THROW( - platform::errors::InvalidArgument("Not support type id: %d.", type)); - } -} - -template