diff --git a/.gitignore b/.gitignore index cecd6fa91c754d0862d26a10833a83aa3ced819c..801790d0a472080af607e9fbcde0284902a4ead8 100644 --- a/.gitignore +++ b/.gitignore @@ -6,10 +6,14 @@ paddle/fluid/eager/api/generated/* paddle/fluid/op_use_default_grad_maker_DEV.spec paddle/fluid/op_use_default_grad_maker_PR.spec paddle/phi/api/backward/backward_api.h +paddle/phi/api/backward/sparse_bw_api.h paddle/phi/api/include/api.h +paddle/phi/api/include/sparse_api.h paddle/phi/api/lib/api.cc paddle/phi/api/lib/dygraph_api.* paddle/phi/api/lib/backward_api.cc +paddle/phi/api/lib/sparse_api.cc +paddle/phi/api/lib/sparse_bw_api.cc paddle/phi/extension.h paddle/phi/include/* paddle/phi/infermeta/generated.* @@ -49,6 +53,10 @@ tools/__pycache__ # This file is automatically generated. # TODO(zhiqiang) Move this file to build directory. paddle/infrt/dialect/pd_ops.td +paddle/infrt/dialect/phi/ir/phi_cpu_kernels.td +paddle/infrt/dialect/phi/ir/phi_gpu_kernels.td +tools/infrt/kernels.json +tools/infrt/kernel_signature.json paddle/infrt/dialect/pd_ops_info.h .lit_test_times.txt paddle/infrt/tests/dialect/Output diff --git a/CMakeLists.txt b/CMakeLists.txt index 5b499fb43ab996b1c1780c0276faad2c37a8808a..6988434996bcc4745726b34278eb6007fdf8605f 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -53,6 +53,7 @@ option(WITH_IPU "Compile PaddlePaddle with Graphcore IPU" OFF) # to develop some acl related functionality on x86 option(WITH_ASCEND_CL "Compile PaddlePaddle with ASCEND CL" ${WITH_ASCEND}) option(WITH_ASCEND_CXX11 "Compile PaddlePaddle with ASCEND and CXX11 ABI" OFF) +option(WITH_ONNXRUNTIME "Compile PaddlePaddle with ONNXRUNTIME" OFF) # Note(zhouwei): It use option above, so put here include(init) include(generic) # simplify cmake module @@ -238,7 +239,8 @@ option(WITH_MIPS "Compile PaddlePaddle with mips support" OFF) option(WITH_MUSL "Compile with musl libc instead of gblic" OFF) option(WITH_UNITY_BUILD "Compile with UnityBuild mode" OFF) option(WITH_STRIP "Strip so files of Whl packages" OFF) -option(NEW_RELEASE_CUBIN "PaddlePaddle next-level release strategy for pypi cubin package" OFF) +option(NEW_RELEASE_PYPI "PaddlePaddle next-level release strategy for pypi cubin package" OFF) +option(NEW_RELEASE_ALL "PaddlePaddle next-level release strategy for all arches cubin package" OFF) option(NEW_RELEASE_JIT "PaddlePaddle next-level release strategy for backup jit package" OFF) option(WITH_ASCEND_INT64 "Compile with int64 kernel for ascend NPU" OFF) option(WITH_POCKETFFT "Compile with pocketfft support" ON) diff --git a/README.md b/README.md index 7dc83aa695cef8ecf177dfc2c444888850342bdc..cdbf2d9f3bf9973fb6c7fe2365ea61f05ce998c1 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ English | [简体中文](./README_cn.md) Welcome to the PaddlePaddle GitHub. PaddlePaddle, as the only independent R&D deep learning platform in China, has been officially open-sourced to professional communities since 2016. It is an industrial platform with advanced technologies and rich features that cover core deep learning frameworks, basic model libraries, end-to-end development kits, tools & components as well as service platforms. -PaddlePaddle is originated from industrial practices with dedication and commitments to industrialization. It has been widely adopted by a wide range of sectors including manufacturing, agriculture, enterprise service, and so on while serving more than 2.3 million developers. With such advantages, PaddlePaddle has helped an increasing number of partners commercialize AI. 
+PaddlePaddle is originated from industrial practices with dedication and commitments to industrialization. It has been widely adopted by a wide range of sectors including manufacturing, agriculture, enterprise service, and so on while serving more than 4 million developers. With such advantages, PaddlePaddle has helped an increasing number of partners commercialize AI. diff --git a/README_cn.md b/README_cn.md index 6b37cfd97b35729dd293452178646db8f1194ca3..3834ee148f940326a2b1e1a8d0fd63a1028b0c96 100644 --- a/README_cn.md +++ b/README_cn.md @@ -15,7 +15,7 @@ 欢迎来到 PaddlePaddle GitHub -飞桨(PaddlePaddle)以百度多年的深度学习技术研究和业务应用为基础,是中国首个自主研发、功能完备、 开源开放的产业级深度学习平台,集深度学习核心训练和推理框架、基础模型库、端到端开发套件和丰富的工具组件于一体。目前,飞桨累计开发者265万,服务企业10万家,基于飞桨开源深度学习平台产生了34万个模型。飞桨助力开发者快速实现AI想法,快速上线AI业务。帮助越来越多的行业完成AI赋能,实现产业智能化升级。 +飞桨(PaddlePaddle)以百度多年的深度学习技术研究和业务应用为基础,是中国首个自主研发、功能完备、 开源开放的产业级深度学习平台,集深度学习核心训练和推理框架、基础模型库、端到端开发套件和丰富的工具组件于一体。目前,飞桨累计开发者406万,服务企业15.7万家,基于飞桨开源深度学习平台产生了47.6万个模型。飞桨助力开发者快速实现AI想法,快速上线AI业务。帮助越来越多的行业完成AI赋能,实现产业智能化升级。 ## 安装 diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index 34c079ba71cf8ff1789ef31b9abb71dc171edfe6..312a0305244684c88e8926d2a71db377b0dd6be1 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -6,16 +6,22 @@ if(WITH_NV_JETSON) add_definitions(-DWITH_NV_JETSON) set(paddle_known_gpu_archs "53 62 72") set(paddle_known_gpu_archs10 "53 62 72") -elseif(NEW_RELEASE_CUBIN) +elseif(NEW_RELEASE_ALL) + message("Using New Release Strategy - All Arches Packge") + add_definitions(-DNEW_RELEASE_ALL) + set(paddle_known_gpu_archs "35 50 52 60 61 70 75 80 86") + set(paddle_known_gpu_archs10 "35 50 52 60 61 70 75") + set(paddle_known_gpu_archs11 "35 50 52 60 61 70 75 80") +elseif(NEW_RELEASE_PYPI) message("Using New Release Strategy - Cubin Packge") - add_definitions(-DNEW_RELEASE_CUBIN) - set(paddle_known_gpu_archs "35 37 50 52 60 61 70 75 80 86") - set(paddle_known_gpu_archs10 "50 60 70 75") - set(paddle_known_gpu_archs11 "60 70 75 80") + add_definitions(-DNEW_RELEASE_PYPI) + set(paddle_known_gpu_archs "35 50 52 60 61 70 75 80 86") + set(paddle_known_gpu_archs10 "") + set(paddle_known_gpu_archs11 "60 61 70 75 80") elseif(NEW_RELEASE_JIT) message("Using New Release Strategy - JIT Packge") add_definitions(-DNEW_RELEASE_JIT) - set(paddle_known_gpu_archs "35 37 50 52 60 61 70 75 80 86") + set(paddle_known_gpu_archs "35 50 52 60 61 70 75 80 86") set(paddle_known_gpu_archs10 "35 50 60 70 75") set(paddle_known_gpu_archs11 "35 50 60 70 75 80") else() @@ -148,7 +154,7 @@ function(select_nvcc_arch_flags out_variable) # remove dots and convert to lists string(REGEX REPLACE "\\." "" cuda_arch_bin "${cuda_arch_bin}") - string(REGEX REPLACE "\\." "" cuda_arch_ptx "${CUDA_ARCH_PTX}") + string(REGEX REPLACE "\\." 
"" cuda_arch_ptx "${cuda_arch_ptx}") string(REGEX MATCHALL "[0-9()]+" cuda_arch_bin "${cuda_arch_bin}") string(REGEX MATCHALL "[0-9]+" cuda_arch_ptx "${cuda_arch_ptx}") diff --git a/cmake/external/llvm.cmake b/cmake/external/llvm.cmake index a7a9e85ffd7314ac7026fccdf45fae2fa3de09d3..9f6fd32ad986c4a5911b1d00dfb548fa3320c34d 100644 --- a/cmake/external/llvm.cmake +++ b/cmake/external/llvm.cmake @@ -100,8 +100,8 @@ endfunction() function(mlir_add_rewriter td_base) set(LLVM_TARGET_DEFINITIONS ${td_base}.td) mlir_tablegen(${td_base}.cpp.inc -gen-rewriters "-I${CMAKE_SOURCE_DIR}/infrt/dialect/pass") - add_public_tablegen_target(${td_base}_IncGen) - add_custom_target(${td_base}_inc DEPENDS ${td_base}_IncGen) + add_public_tablegen_target(MLIR${td_base}IncGen) + add_dependencies(mlir-headers MLIR${td_base}IncGen) endfunction() # Execute the mlir script with infrt-exec program. diff --git a/cmake/external/onnxruntime.cmake b/cmake/external/onnxruntime.cmake new file mode 100644 index 0000000000000000000000000000000000000000..2162f87812d130f19262955798f28e2c2adc4bac --- /dev/null +++ b/cmake/external/onnxruntime.cmake @@ -0,0 +1,94 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +if (NOT WITH_ONNXRUNTIME) + return() +endif () + +if (WITH_ARM) + message(SEND_ERROR "The current onnxruntime backend doesn't support ARM cpu") + return() +endif () + +INCLUDE(ExternalProject) + +add_definitions(-DPADDLE_WITH_ONNXRUNTIME) + +SET(ONNXRUNTIME_PROJECT "extern_onnxruntime") +SET(ONNXRUNTIME_PREFIX_DIR ${THIRD_PARTY_PATH}/onnxruntime) +SET(ONNXRUNTIME_SOURCE_DIR ${THIRD_PARTY_PATH}/onnxruntime/src/${ONNXRUNTIME_PROJECT}) +SET(ONNXRUNTIME_INSTALL_DIR ${THIRD_PARTY_PATH}/install/onnxruntime) +SET(ONNXRUNTIME_INC_DIR "${ONNXRUNTIME_INSTALL_DIR}/include" CACHE PATH "onnxruntime include directory." FORCE) +SET(ONNXRUNTIME_LIB_DIR "${ONNXRUNTIME_INSTALL_DIR}/lib" CACHE PATH "onnxruntime lib directory." FORCE) +SET(CMAKE_BUILD_RPATH "${CMAKE_BUILD_RPATH}" "${ONNXRUNTIME_LIB_DIR}") + + +if (WIN32) + SET(ONNXRUNTIME_URL "https://github.com/microsoft/onnxruntime/releases/download/v1.10.0/onnxruntime-win-x64-1.10.0.zip") +elseif (APPLE) + SET(ONNXRUNTIME_URL "https://github.com/microsoft/onnxruntime/releases/download/v1.10.0/onnxruntime-osx-x86_64-1.10.0.tgz") +else () + SET(ONNXRUNTIME_URL "https://github.com/microsoft/onnxruntime/releases/download/v1.10.0/onnxruntime-linux-x64-1.10.0.tgz") +endif() + + +INCLUDE_DIRECTORIES(${ONNXRUNTIME_INC_DIR}) # For ONNXRUNTIME code to include internal headers. +if (WIN32) + SET(ONNXRUNTIME_SOURCE_LIB "${ONNXRUNTIME_SOURCE_DIR}/lib/onnxruntime.dll" CACHE FILEPATH "ONNXRUNTIME source library." FORCE) + SET(ONNXRUNTIME_SHARED_LIB "${ONNXRUNTIME_INSTALL_DIR}/lib/onnxruntime.dll" CACHE FILEPATH "ONNXRUNTIME shared library." FORCE) + SET(ONNXRUNTIME_LIB "${ONNXRUNTIME_INSTALL_DIR}/lib/onnxruntime.lib" CACHE FILEPATH "ONNXRUNTIME static library." 
FORCE) +elseif (APPLE) + SET(ONNXRUNTIME_SOURCE_LIB "${ONNXRUNTIME_SOURCE_DIR}/lib/libonnxruntime.1.10.0.dylib" CACHE FILEPATH "ONNXRUNTIME source library." FORCE) + SET(ONNXRUNTIME_LIB "${ONNXRUNTIME_INSTALL_DIR}/lib/libonnxruntime.1.10.0.dylib" CACHE FILEPATH "ONNXRUNTIME static library." FORCE) + SET(ONNXRUNTIME_SHARED_LIB ${ONNXRUNTIME_LIB} CACHE FILEPATH "ONNXRUNTIME shared library." FORCE) +else () + SET(ONNXRUNTIME_SOURCE_LIB "${ONNXRUNTIME_SOURCE_DIR}/lib/libonnxruntime.so.1.10.0" CACHE FILEPATH "ONNXRUNTIME source library." FORCE) + SET(ONNXRUNTIME_LIB "${ONNXRUNTIME_INSTALL_DIR}/lib/libonnxruntime.so.1.10.0" CACHE FILEPATH "ONNXRUNTIME static library." FORCE) + SET(ONNXRUNTIME_SHARED_LIB ${ONNXRUNTIME_LIB} CACHE FILEPATH "ONNXRUNTIME shared library." FORCE) +endif () + +if (WIN32) + ExternalProject_Add( + ${ONNXRUNTIME_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + URL ${ONNXRUNTIME_URL} + PREFIX ${ONNXRUNTIME_PREFIX_DIR} + DOWNLOAD_NO_PROGRESS 1 + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + UPDATE_COMMAND "" + INSTALL_COMMAND ${CMAKE_COMMAND} -E copy ${ONNXRUNTIME_SOURCE_LIB} ${ONNXRUNTIME_SHARED_LIB} && + ${CMAKE_COMMAND} -E copy ${ONNXRUNTIME_SOURCE_DIR}/lib/onnxruntime.lib ${ONNXRUNTIME_LIB} && + ${CMAKE_COMMAND} -E copy_directory ${ONNXRUNTIME_SOURCE_DIR}/include ${ONNXRUNTIME_INC_DIR} + BUILD_BYPRODUCTS ${ONNXRUNTIME_LIB} + ) +else () + ExternalProject_Add( + ${ONNXRUNTIME_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + URL ${ONNXRUNTIME_URL} + PREFIX ${ONNXRUNTIME_PREFIX_DIR} + DOWNLOAD_NO_PROGRESS 1 + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + UPDATE_COMMAND "" + INSTALL_COMMAND ${CMAKE_COMMAND} -E copy ${ONNXRUNTIME_SOURCE_LIB} ${ONNXRUNTIME_LIB} && + ${CMAKE_COMMAND} -E copy_directory ${ONNXRUNTIME_SOURCE_DIR}/include ${ONNXRUNTIME_INC_DIR} + BUILD_BYPRODUCTS ${ONNXRUNTIME_LIB} + ) +endif() + +ADD_LIBRARY(onnxruntime STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET onnxruntime PROPERTY IMPORTED_LOCATION ${ONNXRUNTIME_LIB}) +ADD_DEPENDENCIES(onnxruntime ${ONNXRUNTIME_PROJECT}) diff --git a/cmake/external/paddle2onnx.cmake b/cmake/external/paddle2onnx.cmake new file mode 100644 index 0000000000000000000000000000000000000000..661c3675c84b27a7ed8210fec0cfeaa2c858487c --- /dev/null +++ b/cmake/external/paddle2onnx.cmake @@ -0,0 +1,96 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +if(NOT WITH_ONNXRUNTIME) + return() +endif() + +if (WITH_ARM) + message(SEND_ERROR "The current onnxruntime backend doesn't support ARM cpu") + return() +endif () + +INCLUDE(ExternalProject) + +SET(PADDLE2ONNX_PROJECT "extern_paddle2onnx") +SET(PADDLE2ONNX_PREFIX_DIR ${THIRD_PARTY_PATH}/paddle2onnx) +SET(PADDLE2ONNX_INSTALL_DIR ${THIRD_PARTY_PATH}/install/paddle2onnx) +SET(PADDLE2ONNX_INC_DIR "${PADDLE2ONNX_INSTALL_DIR}/include" CACHE PATH "paddle2onnx include directory." 
FORCE) +SET(PADDLE2ONNX_REPOSITORY ${GIT_URL}/PaddlePaddle/Paddle2ONNX.git) +SET(PADDLE2ONNX_TAG cpp) +SET(LIBDIR "lib") +SET(CMAKE_BUILD_RPATH "${CMAKE_BUILD_RPATH}" "${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR}") + +INCLUDE_DIRECTORIES(${PADDLE2ONNX_INC_DIR}) # For PADDLE2ONNX code to include internal headers. +if(WIN32) + SET(PADDLE2ONNX_LIB "${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR}/paddle2onnx.lib" CACHE FILEPATH "paddle2onnx static library." FORCE) + SET(PADDLE2ONNX_SHARED_LIB "${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR}/paddle2onnx.dll" CACHE FILEPATH "paddle2onnx shared library." FORCE) +elseif(APPLE) + SET(PADDLE2ONNX_LIB "${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR}/libpaddle2onnx.dylib" CACHE FILEPATH "PADDLE2ONNX library." FORCE) +else() + SET(PADDLE2ONNX_LIB "${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR}/libpaddle2onnx.so" CACHE FILEPATH "PADDLE2ONNX library." FORCE) +endif(WIN32) + + +# The protoc path is required to compile onnx. +string(REPLACE "/" ";" PROTOC_BIN_PATH ${PROTOBUF_PROTOC_EXECUTABLE}) +list(POP_BACK PROTOC_BIN_PATH) +list(JOIN PROTOC_BIN_PATH "/" PROTOC_BIN_PATH) + + +set(PADDLE2ONNX_OPTIONAL_ARGS + -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} + -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} + -DONNX_CUSTOM_PROTOC_PATH=${PROTOC_BIN_PATH} + -DWITH_STATIC=OFF + -DCMAKE_INSTALL_PREFIX=${PADDLE2ONNX_INSTALL_DIR} + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + ${EXTERNAL_OPTIONAL_ARGS} +) + +if (WITH_PYTHON) + set(PADDLE2ONNX_OPTIONAL_ARGS ${PADDLE2ONNX_OPTIONAL_ARGS} + -DPYTHON_EXECUTABLE:FILEPATH=${PYTHON_EXECUTABLE} + -DPYTHON_INCLUDE_DIR:PATH=${PYTHON_INCLUDE_DIR} + -DPYTHON_LIBRARY:FILEPATH=${PYTHON_LIBRARY} + ) +endif () + + +ExternalProject_Add( + ${PADDLE2ONNX_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + ${SHALLOW_CLONE} + GIT_REPOSITORY ${PADDLE2ONNX_REPOSITORY} + GIT_TAG ${PADDLE2ONNX_TAG} + DEPENDS protobuf + PREFIX ${PADDLE2ONNX_PREFIX_DIR} + UPDATE_COMMAND "" + CMAKE_ARGS ${PADDLE2ONNX_OPTIONAL_ARGS} + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${PADDLE2ONNX_INSTALL_DIR} + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + BUILD_BYPRODUCTS ${PADDLE2ONNX_LIB} +) + +ADD_LIBRARY(paddle2onnx STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET paddle2onnx PROPERTY IMPORTED_LOCATION ${PADDLE2ONNX_LIB}) +ADD_DEPENDENCIES(paddle2onnx ${PADDLE2ONNX_PROJECT}) diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index f7cb7716969f5ccaa97d1ad7964510376b86870a..58ff5f0d2b715d117018eb2ff3d5989c8beb0694 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -198,7 +198,11 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) "-Dprotobuf_MSVC_STATIC_RUNTIME=${MSVC_STATIC_CRT}") ENDIF() - if(WITH_ASCEND AND NOT WITH_ASCEND_CXX11) + + if(WITH_ONNXRUNTIME) + SET(PROTOBUF_REPOSITORY ${GIT_URL}/protocolbuffers/protobuf.git) + SET(PROTOBUF_TAG v3.18.0) + elseif(WITH_ASCEND AND NOT WITH_ASCEND_CXX11) SET(PROTOBUF_REPOSITORY https://gitee.com/tianjianhe/protobuf.git) SET(PROTOBUF_TAG v3.8.0) elseif(WITH_ASCEND_CL AND NOT WITH_ASCEND_CXX11) @@ -248,7 +252,9 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) ) ENDFUNCTION() -if(WITH_ASCEND OR WITH_ASCEND_CL) +if(WITH_ONNXRUNTIME) + SET(PROTOBUF_VERSION 3.18.0) 
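Editor's aside (illustrative, not part of the patch): because `cmake/external/onnxruntime.cmake` above calls `add_definitions(-DPADDLE_WITH_ONNXRUNTIME)`, downstream C++ sources can guard their ONNX Runtime code paths behind that macro. A minimal sketch using the stock ONNX Runtime 1.10 C++ API; the helper name and model path are hypothetical, not taken from this diff.

```cpp
// Illustrative sketch only, not code from this diff. Assumes the headers and
// libraries installed by cmake/external/onnxruntime.cmake are on the include
// and link paths.
#ifdef PADDLE_WITH_ONNXRUNTIME
#include "onnxruntime_cxx_api.h"

// Hypothetical helper: open a session for an ONNX model exported via Paddle2ONNX.
inline Ort::Session CreateOrtSession(const char* model_path) {
  static Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "paddle_inference_demo");
  Ort::SessionOptions options;
  options.SetIntraOpNumThreads(1);  // conservative default for the sketch
  return Ort::Session(env, model_path, options);  // char* overload (Linux/macOS)
}
#endif  // PADDLE_WITH_ONNXRUNTIME
```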
+elseif(WITH_ASCEND OR WITH_ASCEND_CL) SET(PROTOBUF_VERSION 3.8.0) elseif(WITH_IPU) SET(PROTOBUF_VERSION 3.6.1) diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index 415c0fe9bef9eab89e670d8b3f6f7c330b316ed8..cfbe68eecbaca55c5a288aae2c985bbc33d37be2 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -36,7 +36,7 @@ ENDIF() if(NOT DEFINED XPU_BASE_URL) SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev") - SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220219") + SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220307") else() SET(XPU_BASE_URL "${XPU_BASE_URL}") endif() diff --git a/cmake/generic.cmake b/cmake/generic.cmake index f7c17bd7cfe7e099e0afeaf623724e12387aff44..ba59eae392c66354b419bbfd2688a14a26f2e388 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -580,8 +580,8 @@ function(hip_library TARGET_NAME) cmake_parse_arguments(hip_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) if(hip_library_SRCS) # FindHIP.cmake defined hip_add_library, HIP_SOURCE_PROPERTY_FORMAT is requried if no .cu files found - if(NOT ${CMAKE_CURRENT_SOURCE_DIR} MATCHES ".*/operators") - set_source_files_properties(${hip_library_SRCS} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) + if(NOT (${CMAKE_CURRENT_SOURCE_DIR} MATCHES ".*/operators" OR ${CMAKE_CURRENT_SOURCE_DIR} MATCHES ".*/phi/kernels")) + set_source_files_properties(${hip_library_SRCS} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) endif() if (hip_library_SHARED OR hip_library_shared) # build *.so hip_add_library(${TARGET_NAME} SHARED ${hip_library_SRCS}) @@ -651,6 +651,7 @@ function(hip_test TARGET_NAME) set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true) set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true) set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true) + set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT "LD_LIBRARY_PATH=${CMAKE_BINARY_DIR}/python/paddle/libs:$LD_LIBRARY_PATH") endif() endfunction(hip_test) @@ -667,6 +668,7 @@ function(xpu_library TARGET_NAME) else() xpu_add_library(${TARGET_NAME} STATIC ${xpu_library_SRCS} DEPENDS ${xpu_library_DEPS}) find_fluid_modules(${TARGET_NAME}) + find_phi_modules(${TARGET_NAME}) endif() if (xpu_library_DEPS) add_dependencies(${TARGET_NAME} ${xpu_library_DEPS}) diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index c48d31f7e4f90296ecc48acb56e619aae129106e..851bd81403a85e52fbbb3c4c8bf0da1df63c8848 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -114,6 +114,24 @@ function(copy_part_of_thrid_party TARGET DST) endif() endif() + if (WITH_ONNXRUNTIME) + set(dst_dir "${DST}/third_party/install/onnxruntime") + copy(${TARGET} + SRCS ${ONNXRUNTIME_INC_DIR} ${ONNXRUNTIME_LIB_DIR} + DSTS ${dst_dir} ${dst_dir}) + + set(dst_dir "${DST}/third_party/install/paddle2onnx") + if(WIN32) + copy(${TARGET} + SRCS ${PADDLE2ONNX_INC_DIR}/paddle2onnx ${PADDLE2ONNX_SHARED_LIB} ${PADDLE2ONNX_LIB} + DSTS ${dst_dir}/include ${dst_dir}/lib ${dst_dir}/lib) + else() + copy(${TARGET} + SRCS ${PADDLE2ONNX_INC_DIR}/paddle2onnx ${PADDLE2ONNX_LIB} + DSTS ${dst_dir}/include ${dst_dir}/lib) + endif() + endif() + set(dst_dir "${DST}/third_party/install/gflags") copy(${TARGET} SRCS ${GFLAGS_INCLUDE_DIR} ${GFLAGS_LIBRARIES} diff --git a/cmake/operators.cmake b/cmake/operators.cmake index 7affd59de162d5956672e5abfbf9f4b287fb7a83..1291e60cfe4ce13ca9aeeb3f8bdf068af0d5832c 100644 --- a/cmake/operators.cmake +++ 
b/cmake/operators.cmake @@ -293,11 +293,11 @@ function(op_library TARGET) # Define operators that don't need pybind here. foreach(manual_pybind_op "compare_all_op" "compare_op" "logical_op" "bitwise_op" "nccl_op" "tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op") - - if ("${TARGET}" STREQUAL "${manual_pybind_op}") - set(pybind_flag 1) - endif() - endforeach() + + if ("${TARGET}" STREQUAL "${manual_pybind_op}") + set(pybind_flag 1) + endif() + endforeach() # The registration of USE_OP, please refer to paddle/fluid/framework/op_registry.h. # Note that it's enough to just adding one operator to pybind in a *_op.cc file. @@ -478,7 +478,7 @@ function(op_library TARGET) if (${pybind_flag} EQUAL 0) # NOTE(*): activation use macro to regist the kernels, set use_op manually. if(${TARGET} STREQUAL "activation") - file(APPEND ${pybind_file} "USE_OP(relu);\n") + file(APPEND ${pybind_file} "USE_OP_ITSELF(relu);\n") elseif(${TARGET} STREQUAL "fake_dequantize") file(APPEND ${pybind_file} "USE_OP(fake_dequantize_max_abs);\n") elseif(${TARGET} STREQUAL "fake_quantize") diff --git a/cmake/phi.cmake b/cmake/phi.cmake index d9132b84455e7309713b99f9e574bfceb83c7b6c..ebb686d8ad0f31917e64161d6f7d2ecd4644fadd 100644 --- a/cmake/phi.cmake +++ b/cmake/phi.cmake @@ -83,6 +83,8 @@ function(kernel_declare TARGET_LIST) file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, XPU, ALL_LAYOUT);\n") elseif (${kernel_path} MATCHES "./gpudnn\/") file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, GPUDNN, ALL_LAYOUT);\n") + elseif (${kernel_path} MATCHES "./kps\/") + file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, KPS, ALL_LAYOUT);\n") else () # deal with device independent kernel, now we use CPU temporaary file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, CPU, ALL_LAYOUT);\n") @@ -97,6 +99,7 @@ function(kernel_library TARGET) set(gpu_srcs) set(xpu_srcs) set(gpudnn_srcs) + set(kps_srcs) set(selected_rows_srcs) # parse and save the deps kerenl targets set(all_srcs) @@ -128,8 +131,11 @@ function(kernel_library TARGET) if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/gpu/${TARGET}.cu.cc) list(APPEND gpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpu/${TARGET}.cu.cc) endif() - if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/gpudnn/${TARGET}_gpudnn.cu) - list(APPEND gpudnn_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpudnn/${TARGET}_gpudnn.cu) + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/kps/${TARGET}.cu) + list(APPEND gpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/kps/${TARGET}.cu) + endif() + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/gpudnn/${TARGET}.cu) + list(APPEND gpudnn_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpudnn/${TARGET}.cu) endif() endif() if (WITH_XPU) @@ -137,6 +143,15 @@ function(kernel_library TARGET) list(APPEND xpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/xpu/${TARGET}.cc) endif() endif() + if (WITH_XPU_KP) + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/kps/${TARGET}.cu) + # Change XPU2 file suffix + # NOTE(chenweihang): If we can be sure that the *.kps suffix is no longer used, it can be copied directly to *.xpu + file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/kps/${TARGET}.cu DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/kps) + file(RENAME ${CMAKE_CURRENT_BINARY_DIR}/kps/${TARGET}.cu ${CMAKE_CURRENT_BINARY_DIR}/kps/${TARGET}.kps) + list(APPEND kps_srcs ${CMAKE_CURRENT_BINARY_DIR}/kps/${TARGET}.kps) + endif() + endif() else() # TODO(chenweihang): impl compile by source later endif() @@ -150,6 +165,7 @@ function(kernel_library TARGET) list(APPEND all_srcs ${gpu_srcs}) list(APPEND all_srcs ${xpu_srcs}) list(APPEND 
all_srcs ${gpudnn_srcs}) + list(APPEND all_srcs ${kps_srcs}) foreach(src ${all_srcs}) file(READ ${src} target_content) string(REGEX MATCHALL "#include \"paddle\/phi\/kernels\/[a-z0-9_]+_kernel.h\"" include_kernels ${target_content}) @@ -159,11 +175,11 @@ function(kernel_library TARGET) string(REGEX MATCHALL "#include \"paddle\/phi\/kernels\/${kernel_library_SUB_DIR}\/[a-z0-9_]+_kernel.h\"" include_kernels ${target_content}) endif() foreach(include_kernel ${include_kernels}) - if ("${kernel_library_SUB_DIR}" STREQUAL "") - string(REGEX REPLACE "#include \"paddle\/phi\/kernels\/" "" kernel_name ${include_kernel}) - else() - string(REGEX REPLACE "#include \"paddle\/phi\/kernels\/${kernel_library_SUB_DIR}\/" "" kernel_name ${include_kernel}) - endif() + if ("${kernel_library_SUB_DIR}" STREQUAL "") + string(REGEX REPLACE "#include \"paddle\/phi\/kernels\/" "" kernel_name ${include_kernel}) + else() + string(REGEX REPLACE "#include \"paddle\/phi\/kernels\/${kernel_library_SUB_DIR}\/" "" kernel_name ${include_kernel}) + endif() string(REGEX REPLACE ".h\"" "" kernel_name ${kernel_name}) list(APPEND kernel_deps ${kernel_name}) endforeach() @@ -176,72 +192,93 @@ function(kernel_library TARGET) list(LENGTH gpu_srcs gpu_srcs_len) list(LENGTH xpu_srcs xpu_srcs_len) list(LENGTH gpudnn_srcs gpudnn_srcs_len) + list(LENGTH kps_srcs kps_srcs_len) list(LENGTH selected_rows_srcs selected_rows_srcs_len) - # Build Target according different src organization - if((${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR - ${xpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0) AND - (${common_srcs_len} GREATER 0 OR ${selected_rows_srcs_len} GREATER 0)) - # If the common_srcs/selected_rows_srcs depends on specific device srcs, build target using this rule. + # kernel source file level + # level 1: base device kernel + # - cpu_srcs / gpu_srcs / xpu_srcs / gpudnn_srcs / kps_srcs + # level 2: device-independent kernel + # - common_srcs + # level 3: Kernel implemented by reusing device-independent kernel + # - selected_rows_srcs + set(base_device_kernels) + set(device_independent_kernel) + set(high_level_kernels) + + # 1. Base device kernel compile + if (${cpu_srcs_len} GREATER 0) + cc_library(${TARGET}_cpu SRCS ${cpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + list(APPEND base_device_kernels ${TARGET}_cpu) + endif() + if (${gpu_srcs_len} GREATER 0) if (WITH_GPU) - if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0) - nv_library(${TARGET}_part SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - nv_library(${TARGET} SRCS ${common_srcs} ${selected_rows_srcs} DEPS ${TARGET}_part) - endif() + nv_library(${TARGET}_gpu SRCS ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) elseif (WITH_ROCM) - if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0) - hip_library(${TARGET}_part SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - hip_library(${TARGET} SRCS ${common_srcs} ${selected_rows_srcs} DEPS ${TARGET}_part) - endif() - else() - if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0) - cc_library(${TARGET}_part SRCS ${cpu_srcs} ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - cc_library(${TARGET} SRCS ${common_srcs} ${selected_rows_srcs} DEPS ${TARGET}_part) - endif() + hip_library(${TARGET}_gpu SRCS ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) endif() - # If there are only specific device srcs, build target using this rule. 
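Editor's aside (illustrative, not part of the patch): with the new `kps` branch in `kernel_declare`, a kernel that ships `cpu/` and `kps/` sources gets declarations appended to the generated header in exactly the formats shown above. For a hypothetical kernel named `scale` the generated lines would read:

```cpp
// Sketch of generated declarations (the kernel name is only an example).
// Each line mirrors a file(APPEND ...) format string in kernel_declare.
PD_DECLARE_KERNEL(scale, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(scale, KPS, ALL_LAYOUT);
```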
- elseif (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0) + list(APPEND base_device_kernels ${TARGET}_gpu) + endif() + if (${xpu_srcs_len} GREATER 0) + cc_library(${TARGET}_xpu SRCS ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + list(APPEND base_device_kernels ${TARGET}_xpu) + endif() + if (${gpudnn_srcs_len} GREATER 0) if (WITH_GPU) - if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0) - nv_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - endif() + nv_library(${TARGET}_gpudnn SRCS ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) elseif (WITH_ROCM) - if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0) - hip_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - endif() - else() - if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0) - cc_library(${TARGET} SRCS ${cpu_srcs} ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - endif() + hip_library(${TARGET}_gpudnn SRCS ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) endif() - # If the selected_rows_srcs depends on common_srcs, build target using this rule. - elseif (${common_srcs_len} GREATER 0 AND ${selected_rows_srcs_len} GREATER 0) + list(APPEND base_device_kernels ${TARGET}_gpudnn) + endif() + if (${kps_srcs_len} GREATER 0) + # only when WITH_XPU_KP, the kps_srcs_len can be > 0 + xpu_library(${TARGET}_kps SRCS ${kps_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + list(APPEND base_device_kernels ${TARGET}_kps) + endif() + + # 2. Device-independent kernel compile + if (${common_srcs_len} GREATER 0) if (WITH_GPU) - nv_library(${TARGET}_part SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - nv_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${TARGET}_part) + nv_library(${TARGET}_common SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels}) elseif (WITH_ROCM) - hip_library(${TARGET}_part SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - hip_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${TARGET}_part) + hip_library(${TARGET}_common SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels}) + elseif (WITH_XPU_KP) + xpu_library(${TARGET}_common SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels}) else() - cc_library(${TARGET}_part SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - cc_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${TARGET}_part) + cc_library(${TARGET}_common SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels}) endif() - # If there are only common_srcs or selected_rows_srcs, build target using below rules. - elseif (${common_srcs_len} GREATER 0) + list(APPEND device_independent_kernel ${TARGET}_common) + endif() + + # 3. 
Reusing kernel compile + if (${selected_rows_srcs_len} GREATER 0) if (WITH_GPU) - nv_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + nv_library(${TARGET}_sr SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel}) elseif (WITH_ROCM) - hip_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + hip_library(${TARGET}_sr SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel}) + elseif (WITH_XPU_KP) + xpu_library(${TARGET}_sr SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel}) else() - cc_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + cc_library(${TARGET}_sr SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel}) endif() - elseif (${selected_rows_srcs_len} GREATER 0) + list(APPEND high_level_kernels ${TARGET}_sr) + endif() + + # 4. Unify target compile + list(LENGTH base_device_kernels base_device_kernels_len) + list(LENGTH device_independent_kernel device_independent_kernel_len) + list(LENGTH high_level_kernels high_level_kernels_len) + if (${base_device_kernels_len} GREATER 0 OR ${device_independent_kernel_len} GREATER 0 OR + ${high_level_kernels_len} GREATER 0) if (WITH_GPU) - nv_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + nv_library(${TARGET} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel} ${high_level_kernels}) elseif (WITH_ROCM) - hip_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + hip_library(${TARGET} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel} ${high_level_kernels}) + elseif (WITH_XPU_KP) + xpu_library(${TARGET} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel} ${high_level_kernels}) else() - cc_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + cc_library(${TARGET} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel} ${high_level_kernels}) endif() else() set(target_build_flag 0) @@ -249,7 +286,7 @@ function(kernel_library TARGET) if (${target_build_flag} EQUAL 1) if (${common_srcs_len} GREATER 0 OR ${cpu_srcs_len} GREATER 0 OR - ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR + ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0 OR ${selected_rows_srcs_len} GREATER 0) # append target into PHI_KERNELS property get_property(phi_kernels GLOBAL PROPERTY PHI_KERNELS) @@ -275,6 +312,9 @@ function(kernel_library TARGET) if (${gpudnn_srcs_len} GREATER 0) kernel_declare(${gpudnn_srcs}) endif() + if (${kps_srcs_len} GREATER 0) + kernel_declare(${kps_srcs}) + endif() if (${selected_rows_srcs_len} GREATER 0) kernel_declare(${selected_rows_srcs}) endif() diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake index ac3eff04d5383ecdf6c771babcaf3e6811600ac3..7df095c6c2ec04e1a694ed2458787af285c96a9a 100644 --- a/cmake/third_party.cmake +++ b/cmake/third_party.cmake @@ -250,6 +250,12 @@ IF(WITH_TESTING OR WITH_DISTRIBUTE) list(APPEND third_party_deps extern_gtest) ENDIF() +if(WITH_ONNXRUNTIME) + include(external/onnxruntime) # download, build, 
install onnxruntime、paddle2onnx + include(external/paddle2onnx) + list(APPEND third_party_deps extern_onnxruntime extern_paddle2onnx) +endif() + if(WITH_GPU) if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) include(external/cub) # download cub diff --git a/paddle/fluid/distributed/collective/CMakeLists.txt b/paddle/fluid/distributed/collective/CMakeLists.txt index 41652f8b6ed6f717ad8a571be8e7a16408b34504..f88c993d85e2fa6eda27b7e845ee27f08347fa83 100644 --- a/paddle/fluid/distributed/collective/CMakeLists.txt +++ b/paddle/fluid/distributed/collective/CMakeLists.txt @@ -1,5 +1,12 @@ cc_library(processgroup SRCS ProcessGroup.cc DEPS phi phi_api eager_api) +if (WITH_DISTRIBUTE) + cc_library(processgroup_gloo SRCS ProcessGroupGloo.cc DEPS phi phi_api eager_api gloo_wrapper) +endif() +cc_library(eager_reducer SRCS reducer.cc DEPS eager_api processgroup) if(WITH_NCCL) cc_library(processgroup_nccl SRCS ProcessGroupNCCL.cc DEPS place cuda_stream enforce collective_helper device_context phi phi_api eager_api) endif() +if(WITH_ASCEND_CL) + cc_library(processgroup_hccl SRCS ProcessGroupHCCL.cc DEPS place npu_stream enforce collective_helper device_context phi phi_api eager_api) +endif() diff --git a/paddle/fluid/distributed/collective/HCCLTools.h b/paddle/fluid/distributed/collective/HCCLTools.h new file mode 100644 index 0000000000000000000000000000000000000000..09789bd4d378630f548f931bcac00fda89ef33be --- /dev/null +++ b/paddle/fluid/distributed/collective/HCCLTools.h @@ -0,0 +1,174 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include + +#include "boost/variant.hpp" +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/device/npu/enforce_npu.h" +#include "paddle/fluid/platform/device/npu/npu_info.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace distributed { + +class NPUEventManager { + public: + NPUEventManager() = default; + + ~NPUEventManager() { + if (is_created_) { + platform::NPUDeviceGuard guard(device_index_); + platform::NPUEventDestroy(event_); + } + } + + NPUEventManager(const NPUEventManager&) = delete; + NPUEventManager& operator=(const NPUEventManager&) = delete; + + NPUEventManager(NPUEventManager&& other) { + std::swap(is_created_, other.is_created_); + std::swap(device_index_, other.device_index_); + std::swap(event_, other.event_); + } + + NPUEventManager& operator=(NPUEventManager&& other) { + std::swap(is_created_, other.is_created_); + std::swap(device_index_, other.device_index_); + std::swap(event_, other.event_); + return *this; + } + + bool IsCreated() const { return is_created_; } + bool DeviceId() const { return device_index_; } + aclrtEvent GetRawNPUEvent() const { return event_; } + + void Record(const paddle::platform::NPUDeviceContext& ctx) { + auto device_index = ctx.GetPlace().device; + if (!is_created_) { + CreateEvent(device_index); + } + PADDLE_ENFORCE_EQ(device_index, device_index_, + platform::errors::PreconditionNotMet( + "NPUDeviceContext's device %d does not match" + "Event's device %d", + device_index, device_index_)); + + platform::NPUDeviceGuard guard(device_index_); + platform::NPUEventRecord(event_, ctx.stream()); + } + + bool Query() const { + aclrtEventStatus status = ACL_EVENT_STATUS_COMPLETE; + platform::NPUEventQuery(event_, &status); + if (status == ACL_EVENT_STATUS_COMPLETE) { + return true; + } + return false; + } + + void Block(const paddle::platform::NPUDeviceContext& ctx) const { + if (is_created_) { + auto device_index = ctx.GetPlace().device; + PADDLE_ENFORCE_EQ(device_index, device_index_, + platform::errors::PreconditionNotMet( + "CUDADeviceContext's device %d does not match" + "Event's device %d", + device_index, device_index_)); + platform::NPUDeviceGuard guard(device_index_); + platform::NPUStreamWaitEvent(ctx.stream(), event_); + } + } + + private: + bool is_created_{false}; + aclrtEvent event_{}; + int8_t device_index_{0}; + + private: + void CreateEvent(int device_index) { + device_index_ = device_index; + platform::NPUDeviceGuard guard(device_index); + platform::NPUEventCreate(&event_); + is_created_ = true; + } +}; + +class HCCLCommManager { + public: + explicit HCCLCommManager(HcclComm hcclComm) : hccl_comm_(hcclComm) {} + + HCCLCommManager() : HCCLCommManager(nullptr) {} + + ~HCCLCommManager() noexcept { + std::unique_lock lock(mutex_); + if (hccl_comm_) { + platform::dynload::HcclCommDestroy(hccl_comm_); + } + } + + static std::shared_ptr Create(int num_ranks, int rank, + HcclRootInfo* comm_id, + HcclComm hccl_comm) { + auto hccl_manager = std::make_shared(); + auto ret = platform::dynload::HcclCommInitRootInfo(num_ranks, comm_id, rank, + &hccl_comm); + using __NPU_STATUS_TYPE__ = decltype(ret); + constexpr auto __success_type__ = + platform::details::NPUStatusType<__NPU_STATUS_TYPE__>::kSuccess; + if (UNLIKELY(ret != __success_type__)) { + VLOG(0) << "Error: create hccl_id error."; + exit(-1); + } + + 
hccl_manager->hccl_id_ = comm_id; + hccl_manager->rank_ = rank; + hccl_manager->hccl_comm_ = hccl_comm; + return hccl_manager; + } + + HcclRootInfo* GetHcclId() const { + std::unique_lock lock(mutex_); + return hccl_id_; + } + + HcclComm GetHcclComm() const { + std::unique_lock lock(mutex_); + return hccl_comm_; + } + + HCCLCommManager(const HCCLCommManager&) = delete; + HCCLCommManager& operator=(const HCCLCommManager&) = delete; + HCCLCommManager& operator=(HCCLCommManager&& other) = delete; + + HCCLCommManager(HCCLCommManager&& other) { + std::unique_lock lock(other.mutex_); + std::swap(hccl_comm_, other.hccl_comm_); + } + + protected: + HcclComm hccl_comm_; + HcclRootInfo* hccl_id_; + int rank_; + mutable std::mutex mutex_; +}; + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/collective/ProcessGroup.h b/paddle/fluid/distributed/collective/ProcessGroup.h index dde8622d9007e1372739d0fedde4938f85eda323..e43d0e8c183c7005f31b66c4c29dfc95361485e4 100644 --- a/paddle/fluid/distributed/collective/ProcessGroup.h +++ b/paddle/fluid/distributed/collective/ProcessGroup.h @@ -96,7 +96,54 @@ class ProcessGroup { std::vector& /* tensors */, const BroadcastOptions& = BroadcastOptions()) { PADDLE_THROW(platform::errors::InvalidArgument( - "ProcessGroup%s does not support allreduce", GetBackendName())); + "ProcessGroup%s does not support broadcast", GetBackendName())); + } + + virtual std::shared_ptr Barrier( + const BarrierOptions& = BarrierOptions()) { + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support barrier", GetBackendName())); + } + + virtual std::shared_ptr Send( + std::vector& tensors /* tensors */, int dst_rank) { // NOLINT + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support send", GetBackendName())); + } + + virtual std::shared_ptr Recv( + std::vector& tensors /* tensors */, int src_rank) { // NOLINT + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support receive", GetBackendName())); + } + + virtual std::shared_ptr AllGather( + std::vector& in_tensors /* tensors */, // NOLINT + std::vector& out_tensors /* tensors */) { // NOLINT + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support AllGather", GetBackendName())); + } + + virtual std::shared_ptr AllToAll( + std::vector& in /* tensors */, // NOLINT + std::vector& out /* tensors */) { // NOLINT + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support AllToAll", GetBackendName())); + } + + virtual std::shared_ptr Reduce( + std::vector& tensors /* tensors */, // NOLINT + const ReduceOptions& opts) { // NOLINT + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support Reduce", GetBackendName())); + } + + virtual std::shared_ptr Scatter( + std::vector& in_tensors /* tensors */, // NOLINT + std::vector& out_tensors /* tensors */, // NOLINT + const ScatterOptions&) { // NOLINT + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support Scatter", GetBackendName())); } protected: diff --git a/paddle/fluid/distributed/collective/ProcessGroupGloo.cc b/paddle/fluid/distributed/collective/ProcessGroupGloo.cc new file mode 100644 index 0000000000000000000000000000000000000000..5dc43af117825bf95407255e93e1e4600e8ddd9a --- /dev/null +++ b/paddle/fluid/distributed/collective/ProcessGroupGloo.cc @@ -0,0 +1,502 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
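Editor's aside (illustrative, not part of the patch): the `ProcessGroup.h` hunk above gives every newly added collective (`Barrier`, `Send`, `Recv`, `AllGather`, `AllToAll`, `Reduce`, `Scatter`) a default implementation that throws "ProcessGroup%s does not support ...", so a concrete backend only overrides what it actually supports. A minimal sketch of that pattern, assuming the usual `std::shared_ptr<ProcessGroup::Task>` return type that the flattened diff elides; the backend class here is hypothetical.

```cpp
#include <memory>
#include <string>

#include "paddle/fluid/distributed/collective/ProcessGroup.h"

namespace paddle {
namespace distributed {

// Hypothetical backend: only Barrier is overridden. Any other collective,
// e.g. AllToAll, falls through to the base class and raises
// "ProcessGroupFAKE does not support AllToAll".
class FakeProcessGroup : public ProcessGroup {
 public:
  FakeProcessGroup(int rank, int size) : ProcessGroup(rank, size) {}

  const std::string GetBackendName() const override { return "FAKE"; }

  std::shared_ptr<ProcessGroup::Task> Barrier(
      const BarrierOptions& opts) override {
    // A real backend would enqueue the barrier and return a live task handle.
    return nullptr;
  }
};

}  // namespace distributed
}  // namespace paddle
```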
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#ifdef _WIN32 +#include +#include +#include +#else +#include +#include +#include +#endif + +#include +#include +#include +#include "paddle/fluid/distributed/collective/ProcessGroupGloo.h" +#include "paddle/fluid/framework/fleet/gloo_wrapper.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace distributed { + +#ifdef _WIN32 +#define GENERATE_FUNC(type, func, ...) \ + switch (type) { \ + case experimental::DataType::FLOAT32: \ + func(__VA_ARGS__); \ + break; \ + case experimental::DataType::FLOAT64: \ + func(__VA_ARGS__); \ + break; \ + case experimental::DataType::FLOAT16: \ + func(__VA_ARGS__); \ + break; \ + case experimental::DataType::INT32: \ + func(__VA_ARGS__); \ + break; \ + case experimental::DataType::INT64: \ + func(__VA_ARGS__); \ + break; \ + default: \ + VLOG(0) << "Error: Unknown DataType."; \ + exit(-1); \ + } + +#define HOST_NAME_MAX 256 + +#else +#define GENERATE_FUNC(type, func, args...) \ + switch (type) { \ + case experimental::DataType::FLOAT32: \ + func(args); \ + break; \ + case experimental::DataType::FLOAT64: \ + func(args); \ + break; \ + case experimental::DataType::FLOAT16: \ + func(args); \ + break; \ + case experimental::DataType::INT32: \ + func(args); \ + break; \ + case experimental::DataType::INT64: \ + func(args); \ + break; \ + default: \ + VLOG(0) << "Error: Unknown DataType."; \ + exit(-1); \ + } +#endif + +typedef void (*reduce_func)(void*, const void*, const void*, size_t); + +template +reduce_func get_function(const ReduceOp& r) { + switch (r) { + case ReduceOp::SUM: + return reduce_func(&::gloo::sum); + case ReduceOp::PRODUCT: + return reduce_func(&::gloo::product); + case ReduceOp::MIN: + return reduce_func(&::gloo::min); + case ReduceOp::MAX: + return reduce_func(&::gloo::max); + case ReduceOp::AVG: + VLOG(0) << "Error: Unsupported ReduceOp::AVG."; + exit(-1); + } + + VLOG(0) << "Error: Unknown ReduceOp."; + exit(-1); +} + +bool CheckTensorsInCPUPlace(const std::vector& tensors) { + return std::all_of(tensors.cbegin(), tensors.cend(), [&](const Tensor& t) { + return t.place() == PlaceType::kCPU; + }); +} + +template +T* get_data(const Tensor& tensor) { + auto raw_tensor = std::dynamic_pointer_cast(tensor.impl()); + return static_cast(raw_tensor->data()); +} + +template +std::vector get_multi_data(const std::vector& tensors) { + std::vector ret(tensors.size()); + for (size_t i = 0; i < tensors.size(); i++) { + ret[i] = get_data(tensors[i]); + } + return ret; +} + +template +void set_output(P& opts, const Tensor& tensor) { // NOLINT + opts.setOutput(get_data(tensor), tensor.numel()); +} + +template +void set_input(P& opts, const Tensor& tensor) { // NOLINT + opts.setInput(get_data(tensor), tensor.numel()); +} + +template +void set_outputs(P& opts, const std::vector& tensors) { // NOLINT + opts.setOutputs(get_multi_data(tensors), tensors[0].numel()); +} + +template +void set_inputs(P& opts, const std::vector& tensors) { // NOLINT + 
opts.setInputs(get_multi_data(tensors), tensors[0].numel()); +} + +template +void set_inputs_for_scatter(P& opts, // NOLINT + const std::vector& tensors, // NOLINT + int nranks) { + std::vector ret(nranks); + auto raw_tensor = + std::dynamic_pointer_cast(tensors[0].impl()); + T* raw_pointer = reinterpret_cast(raw_tensor->data()); + size_t offset = 0; + for (int i = 0; i < nranks; i++) { + ret[i] = raw_pointer + offset; + offset += tensors[0].numel() / nranks; + } + opts.setInputs(ret, tensors[0].numel() / nranks); +} + +ProcessGroupGloo::GlooTask::GlooTask(int rank, + const std::vector& inputs, + CommType comm_type) + : ProcessGroup::Task(rank, inputs, comm_type) { + PADDLE_ENFORCE_EQ(CheckTensorsInCPUPlace(inputs), true, + platform::errors::Fatal( + "Only CPU place is supported for ProcessGroupGloo.")); +} + +ProcessGroupGloo::ProcessGroupGloo(const std::shared_ptr& store, + int rank, int world_size, + const std::shared_ptr options) + : ProcessGroup(rank, world_size), _tag(0), _store(store) { + _context = std::make_shared(rank, world_size); + auto prefix_store = + ::gloo::rendezvous::PrefixStore(std::to_string(0), *_store); + _context->connectFullMesh(prefix_store, options->device); +} + +class BroadcastGlooTask : public ProcessGroupGloo::GlooTask { + public: + BroadcastGlooTask(const std::shared_ptr& context, + const std::vector& inputs, int rank, int root, + uint32_t tag) + : ProcessGroupGloo::GlooTask(rank, inputs, CommType::BROADCAST), + _context(context), + _root(root), + _inputs(inputs), + _tag(tag) {} + + void Run() override { _do_broadcast(_inputs[0]); } + + private: + std::shared_ptr _context; + const int _root; + std::vector _inputs{}; + const uint32_t _tag; + + void _do_broadcast(const Tensor& tensor) { + gloo::BroadcastOptions opts(_context); + const auto& dtype = tensor.type(); + GENERATE_FUNC(dtype, set_output, opts, tensor); + opts.setRoot(_root); + opts.setTag(_tag); + gloo::broadcast(opts); + } +}; + +std::shared_ptr ProcessGroupGloo::Broadcast( + std::vector& inputs, const BroadcastOptions& opts) { + auto root = opts.source_rank; + std::unique_ptr task; + auto tag = next_tag(); + auto context = get_context(); + task = std::make_unique(context, inputs, rank_, root, tag); + task->Run(); + return task; +} + +class AllreduceGlooTask : public ProcessGroupGloo::GlooTask { + public: + AllreduceGlooTask(int rank, const std::shared_ptr& context, + std::vector& inputs, ReduceOp reduce_op, // NOLINT + uint32_t tag) + : ProcessGroupGloo::GlooTask(rank, inputs, CommType::ALLREDUCE), + _context(context), + _inputs(inputs), + _reduce_op(reduce_op), + _tag(tag) {} + + void Run() override { _do_allreduce(_inputs); } + + private: + std::shared_ptr _context; + std::vector _inputs; + const ReduceOp _reduce_op; + uint32_t _tag; + + gloo::AllreduceOptions::Func _get_function(const experimental::DataType type, + const ReduceOp op) { + gloo::AllreduceOptions::Func fn; + GENERATE_FUNC(type, _get_function_impl, fn, op); + return fn; + } + + template + void _get_function_impl(gloo::AllreduceOptions::Func& fn, // NOLINT + const ReduceOp op) { + fn = get_function(op); + } + + void _do_allreduce(std::vector& tensors) { // NOLINT + const auto& dtype = tensors[0].type(); + gloo::AllreduceOptions opts(_context); + GENERATE_FUNC(dtype, set_inputs, opts, tensors); + GENERATE_FUNC(dtype, set_outputs, opts, tensors); + opts.setReduceFunction(_get_function(dtype, _reduce_op)); + opts.setTag(_tag); + gloo::allreduce(opts); + } +}; + +std::shared_ptr ProcessGroupGloo::AllReduce( + std::vector& inputs, const 
AllreduceOptions& opts) { + auto tag = next_tag(); + std::shared_ptr task; + auto context = get_context(); + task = std::make_shared(rank_, context, inputs, + opts.reduce_op, tag); + task->Run(); + return task; +} + +class BarrierGlooTask : public ProcessGroupGloo::GlooTask { + public: + BarrierGlooTask(int rank, const std::shared_ptr& context) + : ProcessGroupGloo::GlooTask(rank, std::vector{}, + CommType::BARRIER), + _context(context) {} + + void Run() override { _do_barrier(); } + + private: + std::shared_ptr _context; + + void _do_barrier() { + gloo::BarrierOptions opts(_context); + gloo::barrier(opts); + } +}; + +std::shared_ptr ProcessGroupGloo::Barrier( + const BarrierOptions& opts) { + std::shared_ptr task; + auto context = get_context(); + task = std::make_shared(rank_, context); + task->Run(); + return task; +} + +class AllgatherGlooTask : public ProcessGroupGloo::GlooTask { + public: + AllgatherGlooTask(int rank, const std::shared_ptr& context, + std::vector& inputs, // NOLINT + std::vector& outputs, // NOLINT + uint32_t tag) + : ProcessGroupGloo::GlooTask(rank, inputs, CommType::ALLGATHER), + _context(context), + _inputs(inputs), + _outputs(outputs), + _tag(tag) {} + + void Run() override { _do_allgather(_inputs, _outputs); } + + private: + std::shared_ptr _context; + std::vector _inputs; + std::vector _outputs; + uint32_t _tag; + + void _do_allgather(std::vector& in, // NOLINT + std::vector& out) { // NOLINT + const auto& dtype = in[0].type(); + gloo::AllgatherOptions opts(_context); + GENERATE_FUNC(dtype, set_input, opts, in[0]); + GENERATE_FUNC(dtype, set_output, opts, out[0]); + opts.setTag(_tag); + gloo::allgather(opts); + } +}; + +std::shared_ptr ProcessGroupGloo::AllGather( + std::vector& in_tensors, std::vector& out_tensors) { + std::shared_ptr task; + auto tag = next_tag(); + auto context = get_context(); + task = std::make_shared(rank_, context, in_tensors, + out_tensors, tag); + task->Run(); + return task; +} + +class ReduceGlooTask : public ProcessGroupGloo::GlooTask { + public: + ReduceGlooTask(int rank, const std::shared_ptr& context, + std::vector& in, ReduceOp reduce_op, // NOLINT + int dst, uint32_t tag) + : ProcessGroupGloo::GlooTask(rank, in, CommType::REDUCE), + _context(context), + _inputs(in), + _reduce_op(reduce_op), + _dst(dst), + _tag(tag) {} + + void Run() override { _do_reduce(_inputs, _dst); } + + private: + std::shared_ptr _context; + std::vector _inputs; + const ReduceOp _reduce_op; + int _dst; + uint32_t _tag; + + gloo::ReduceOptions::Func _get_function(const experimental::DataType type, + const ReduceOp op) { + gloo::ReduceOptions::Func fn; + GENERATE_FUNC(type, _get_function_impl, fn, op); + return fn; + } + + template + void _get_function_impl(gloo::ReduceOptions::Func& fn, // NOLINT + const ReduceOp op) { + fn = get_function(op); + } + + void _do_reduce(std::vector& tensors, int dst) { // NOLINT + const auto& dtype = tensors[0].type(); + gloo::ReduceOptions opts(_context); + GENERATE_FUNC(dtype, set_input, opts, tensors[0]); + GENERATE_FUNC(dtype, set_output, opts, tensors[0]); + opts.setReduceFunction(_get_function(dtype, _reduce_op)); + opts.setTag(_tag); + opts.setRoot(dst); + gloo::reduce(opts); + } +}; + +std::shared_ptr ProcessGroupGloo::Reduce( + std::vector& tensors, const ReduceOptions& opts) { + std::shared_ptr task; + auto tag = next_tag(); + auto context = get_context(); + task = std::make_shared(rank_, context, tensors, + opts.reduce_op, opts.root_rank, tag); + task->Run(); + return task; +} + +class ScatterGlooTask : public 
ProcessGroupGloo::GlooTask { + public: + ScatterGlooTask(int rank, const std::shared_ptr& context, + std::vector& inputs, // NOLINT + std::vector& outputs, // NOLINT + int src, int size, uint32_t tag) + : ProcessGroupGloo::GlooTask(rank, inputs, CommType::SCATTER), + _context(context), + _inputs(inputs), + _outputs(outputs), + _src(src), + _size(size), + _tag(tag) {} + + void Run() override { _do_scatter(_inputs, _outputs, _src); } + + private: + std::shared_ptr _context; + std::vector _inputs; + std::vector _outputs; + int _src; + int _size; + uint32_t _tag; + + void _do_scatter(std::vector& in, std::vector& out, // NOLINT + int src) { + const auto& dtype = in[0].type(); + gloo::ScatterOptions opts(_context); + if (rank_ == src) { + GENERATE_FUNC(dtype, set_inputs_for_scatter, opts, in, _size); + } + GENERATE_FUNC(dtype, set_output, opts, out[0]); + opts.setRoot(src); + opts.setTag(_tag); + gloo::scatter(opts); + } +}; + +std::shared_ptr ProcessGroupGloo::Scatter( + std::vector& in_tensors, std::vector& out_tensors, + const ScatterOptions& opts) { + std::shared_ptr task; + auto tag = next_tag(); + auto context = get_context(); + task = std::make_shared( + rank_, context, in_tensors, out_tensors, opts.root_rank, size_, tag); + task->Run(); + return task; +} + +std::shared_ptr<::gloo::transport::Device> +ProcessGroupGloo::createDeviceForInterface(const std::string& ifname) { + ::gloo::transport::tcp::attr attr; + attr.iface = ifname; + return ::gloo::transport::tcp::CreateDevice(attr); +} + +std::shared_ptr<::gloo::transport::Device> +ProcessGroupGloo::createDeviceForHostname(const std::string& hostname) { + ::gloo::transport::tcp::attr attr; + attr.hostname = hostname; + return ::gloo::transport::tcp::CreateDevice(attr); +} + +std::shared_ptr<::gloo::transport::Device> +ProcessGroupGloo::createDefaultDevice() { + std::array hostname{}; + auto ret = ::gethostname(hostname.data(), HOST_NAME_MAX); + PADDLE_ENFORCE_EQ(ret, 0, platform::errors::Fatal( + "Get hostname error for createDefaultDevice.")); + ::addrinfo* result; + result = tcputils::get_addr_info(hostname.data(), "", 0, AF_UNSPEC); + ::addrinfo* cur; + for (cur = result; cur != nullptr; cur = cur->ai_next) { + SocketType socket = + ::socket(cur->ai_family, cur->ai_socktype, cur->ai_protocol); + if (socket == -1) { + continue; + } + ret = ::bind(socket, cur->ai_addr, cur->ai_addrlen); +#ifdef _WIN32 + closesocket(socket); +#else + close(socket); +#endif + if (ret == -1) { + continue; + } + break; + } + freeaddrinfo(result); + if (cur != nullptr) { + return createDeviceForHostname(hostname.data()); + } + return createDeviceForHostname("127.0.0.1"); +} + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/collective/ProcessGroupGloo.h b/paddle/fluid/distributed/collective/ProcessGroupGloo.h new file mode 100644 index 0000000000000000000000000000000000000000..24f156571a427128f09cd28e632212f47fa4cd47 --- /dev/null +++ b/paddle/fluid/distributed/collective/ProcessGroupGloo.h @@ -0,0 +1,152 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include "paddle/fluid/distributed/collective/ProcessGroup.h" + +#ifdef PADDLE_WITH_GLOO +#include "paddle/fluid/framework/fleet/gloo_wrapper.h" +#endif + +#include "paddle/fluid/distributed/store/store.h" +#include "paddle/fluid/distributed/store/tcp_store.h" + +constexpr const char* GLOO_BACKEND_NAME = "GLOO"; + +namespace paddle { +namespace distributed { + +class ProcessGroupGloo : public ProcessGroup { + public: + class GlooTask : public ProcessGroup::Task, + public std::enable_shared_from_this { + public: + explicit GlooTask(int rank, const std::vector& input_tensors, + CommType comm_type); + + ~GlooTask() = default; + + virtual void Run() = 0; + bool Wait(std::chrono::milliseconds timeout) override { return true; } + bool IsCompleted() override { return true; } + void Synchronize() override {} + + protected: + friend class ProcessGroupGloo; + }; + + class GlooStore : public ::gloo::rendezvous::Store { + public: + explicit GlooStore( + const std::shared_ptr& store) + : _store(store) {} + + ~GlooStore() = default; + + std::vector get(const std::string& key) override { + VLOG(3) << "GlooStore::get"; + auto value = _store->get(key); + return std::vector(value.begin(), value.end()); + } + + void wait(const std::vector& keys) override { + VLOG(3) << "GlooStore::wait"; + for (auto& key : keys) { + _store->wait(key); + } + } + + void set(const std::string& key, const std::vector& value) override { + VLOG(3) << "GlooStore::set"; + std::vector tmp(value.begin(), value.end()); + _store->set(key, tmp); + } + + void wait(const std::vector& keys, + const std::chrono::milliseconds& timeout) override { + VLOG(3) << "GlooStore::wait"; + for (auto& key : keys) { + _store->wait(key); + } + // wait(keys); + } + + protected: + std::shared_ptr _store; + }; + + class GlooOptions { + public: + GlooOptions() = default; + ~GlooOptions() = default; + static std::shared_ptr create() { + return std::make_shared(); + } + std::shared_ptr<::gloo::transport::Device> device; + }; + + explicit ProcessGroupGloo(const std::shared_ptr& store, int rank, + int world_size, + std::shared_ptr options); + + ~ProcessGroupGloo() = default; + + std::shared_ptr Broadcast( + std::vector& inputs, + const BroadcastOptions& = BroadcastOptions()) override; + + std::shared_ptr AllReduce( + std::vector& inputs, + const AllreduceOptions& opts = AllreduceOptions()) override; + + std::shared_ptr Barrier( + const BarrierOptions& = BarrierOptions()) override; + + std::shared_ptr AllGather( + std::vector& in_tensors, + std::vector& out_tensors) override; + + std::shared_ptr Reduce( + std::vector& tensors, const ReduceOptions& opts) override; + + std::shared_ptr Scatter(std::vector& in_tensors, + std::vector& out_tensors, + const ScatterOptions&) override; + + std::shared_ptr<::gloo::Context> get_context() { return _context; } + uint64_t next_tag() { return _tag++; } + + const std::string GetBackendName() const override { + return GLOO_BACKEND_NAME; + } + + // Helper functions for Gloo. 
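  // createDeviceForInterface binds the Gloo TCP transport to a named NIC
  // (attr.iface), createDeviceForHostname binds it to whatever address the
  // given hostname resolves to (attr.hostname), and createDefaultDevice
  // probes the local hostname and falls back to 127.0.0.1 when none of its
  // addresses can be bound (see the implementation in ProcessGroupGloo.cc
  // above).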
+ static std::shared_ptr<::gloo::transport::Device> createDeviceForHostname( + const std::string& hostname); + static std::shared_ptr<::gloo::transport::Device> createDeviceForInterface( + const std::string& ifname); + static std::shared_ptr<::gloo::transport::Device> createDefaultDevice(); + + protected: + uint32_t _tag; + std::shared_ptr _context; + std::shared_ptr _store; +}; + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc new file mode 100644 index 0000000000000000000000000000000000000000..2deeb7ca03003d0b6c8fa0948afa0a3394639f8b --- /dev/null +++ b/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc @@ -0,0 +1,354 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/distributed/collective/ProcessGroupHCCL.h" +#include "paddle/fluid/memory/malloc.h" +#include "paddle/fluid/platform/device/npu/hccl_helper.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/phi/api/include/api.h" +#include "paddle/phi/common/place.h" + +DECLARE_bool(hccl_blocking_wait); +// DECLARE_bool(use_stream_safe_npu_allocator); + +constexpr int64_t kWaitBlockTImeout = 10; + +namespace paddle { +namespace distributed { + +static HcclReduceOp ToHCCLRedType(ReduceOp reduction) { + static const std::map red_type = { + {ReduceOp::MIN, HCCL_REDUCE_MIN}, + {ReduceOp::MAX, HCCL_REDUCE_MAX}, + {ReduceOp::SUM, HCCL_REDUCE_SUM}, + {ReduceOp::PRODUCT, HCCL_REDUCE_PROD}, + }; + auto it = red_type.find(reduction); + PADDLE_ENFORCE_EQ( + it != red_type.end(), true, + platform::errors::InvalidArgument("Invalid hccl reduction. 
" + "Must be Min | Max | Prod | Sum")); + return it->second; +} + +std::string SerializeHCCLUniqueId(const HcclRootInfo& hcclID) { + const uint8_t* bytes = reinterpret_cast(&hcclID); + std::ostringstream oss; + for (size_t i = 0; i < sizeof(hcclID); ++i) { + oss << std::hex << static_cast(bytes[i]); + } + return oss.str(); +} + +// Get the list of devices from list of tensors +std::vector GetPlaceList(const std::vector& tensors) { + std::vector places; + places.reserve(tensors.size()); + for (auto& tensor : tensors) { + places.push_back(tensor.inner_place()); + } + return places; +} + +// Get the deviceList String from the list of devices +std::string GetKeyFromPlaces(const std::vector& places) { + std::string placeList; + for (auto& place : places) { + std::stringstream tmp; + tmp << place; + if (placeList.empty()) { + placeList += tmp.str(); + } else { + placeList += "," + tmp.str(); + } + } + return placeList; +} + +// bool CheckTensorsInNPUPlace(const std::vector& tensors) { +// return std::all_of(tensors.cbegin(), tensors.cend(), [&](const Tensor& t) { +// return t.place() == platform::DeviceType::NPU; +// }); +// } + +void SyncDefaultStream( + const std::vector& places, + std::vector& hcclEvents, // NOLINT + std::vector>& dev_ctx) { // NOLINT + for (size_t i = 0; i < places.size(); ++i) { + auto* default_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(places[i])); + hcclEvents[i].Record(*dev_ctx[i]); + hcclEvents[i].Block(*default_ctx); + } +} + +std::shared_ptr ProcessGroupHCCL::CreateTask( + std::vector places, int rank, CommType comm_type, + const std::vector& inputs) { + return std::make_shared(places, rank, comm_type, + inputs); +} + +ProcessGroupHCCL::HCCLTask::HCCLTask(const std::vector& places, int rank, + CommType CommType, + const std::vector& inputs) + : Task(rank, inputs, CommType), places_(places) { + control_events_.resize(places.size()); + hcclComms_.resize(places.size()); +} + +ProcessGroupHCCL::HCCLTask::~HCCLTask() {} + +void ProcessGroupHCCL::HCCLTask::SetOutputs( + std::vector& outputs) { // NOLINT + outputs_ = std::make_shared>(outputs); +} + +void ProcessGroupHCCL::HCCLTask::SynchronizeStreams() { + for (size_t i = 0; i < places_.size(); ++i) { + auto* default_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(places_[i])); + platform::NPUStreamWaitEvent(default_ctx->stream(), + control_events_[i].GetRawNPUEvent()); + } +} + +bool ProcessGroupHCCL::HCCLTask::IsCompleted() { + for (size_t i = 0; i < places_.size(); ++i) { + if (!control_events_[i].Query()) { + return false; + } + } + + return true; +} + +// TODO(sandyhouse): Add timeout for wait, now timeout unused +bool ProcessGroupHCCL::HCCLTask::Wait(std::chrono::milliseconds timeout) { + SynchronizeStreams(); + // NOTE(sandyhouse): It will block host for sync + while (!IsCompleted()) { + std::this_thread::sleep_for(std::chrono::milliseconds(kWaitBlockTImeout)); + } + return true; +} + +// Same as Wait +void ProcessGroupHCCL::HCCLTask::Synchronize() { Wait(kWaitTimeout); } + +ProcessGroupHCCL::ProcessGroupHCCL(const std::shared_ptr& store, + int rank, int size) + : ProcessGroup(rank, size), store_(store) {} + +void ProcessGroupHCCL::BroadcastUniqueHCCLID( + std::vector& hccl_ids) { // NOLINT + if (rank_ == 0) { + for (size_t i = 0; i < hccl_ids.size(); i++) { + auto key = "ProcessGroupHCCL/hccl_ids/" + std::to_string(i); + auto hccl_id = std::vector( + reinterpret_cast(&hccl_ids[i]), + reinterpret_cast(&hccl_ids[i]) + sizeof(HcclRootInfo)); + store_->set(key, hccl_id); + } + } 
else { + for (size_t i = 0; i < hccl_ids.size(); i++) { + auto key = "ProcessGroupHCCL/hccl_ids/" + std::to_string(i); + auto ret = store_->get(key); + std::memcpy(&hccl_ids[i], ret.data(), ret.size()); + } + } +} + +// create HCCLManager cache for places_key +void ProcessGroupHCCL::CreateHCCLManagerCache( + const std::string& places_key, const std::vector& places) { + PADDLE_ENFORCE_EQ(places_key.empty(), false, + platform::errors::PreconditionNotMet( + "Not able to create/get the HCCL Communicator since " + "the NPU place are not known")); + + std::vector> hccl_comms; + hccl_comms.resize(places.size()); + + // using vector just for broadcast + std::vector hccl_ids; + hccl_ids.resize(1); + auto& hccl_id = hccl_ids.front(); + + if (rank_ == 0) { + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclGetRootInfo(&hccl_id)); + } + BroadcastUniqueHCCLID(hccl_ids); + + VLOG(3) << "init hccl rank: " << rank_ << ", nranks: " << size_ + << ", place: " << places_key + << ", hccl uniqueid: " << SerializeHCCLUniqueId(hccl_id); + + std::vector> dev_ctx; + dev_ctx.resize(places.size()); + + std::unique_ptr comms(new HcclComm[places.size()]); + for (size_t i = 0; i < places.size(); ++i) { + platform::NPUDeviceGuard guard(places[i].GetDeviceId()); + hccl_comms[i] = HCCLCommManager::Create(GetSize(), GetRank(), &hccl_id, + comms.get() + i); + dev_ctx[i].reset(new NPUDeviceContext(places[i])); + } + + std::vector events; + events.resize(places.size()); + + // These caches will be useful to process sync/wait/communicate + places_to_events_.emplace(places_key, std::move(events)); + places_to_hcclcomm_.emplace(places_key, std::move(hccl_comms)); + places_to_ctx_.emplace(places_key, std::move(dev_ctx)); +} + +template +std::shared_ptr ProcessGroupHCCL::Collective( + std::vector& inputs, std::vector& outputs, Fn fn, + CommType op_type) { + const auto places = GetPlaceList(inputs); + const auto key = GetKeyFromPlaces(places); + + { + std::lock_guard lock(mutex_); + if (places_to_hcclcomm_.find(key) == places_to_hcclcomm_.end()) { + CreateHCCLManagerCache(key, places); + } + } + + auto& hccl_comms = places_to_hcclcomm_[key]; + + SyncDefaultStream(places, places_to_events_[key], places_to_ctx_[key]); + + auto task = CreateTask(places, rank_, op_type, inputs); + task->SetOutputs(outputs); + + // if (FLAGS_use_stream_safe_npu_allocator) { + // for (size_t i = 0; i < inputs.size(); ++i) { + // platform::NPUDeviceGuard guard(places[i].GetDeviceId()); + // auto dense_tensor = + // std::dynamic_pointer_cast(inputs[i].impl()); + // memory::RecordStream(dense_tensor->Holder(), + // places_to_ctx_[key][i]->stream()); + // } + // } + + for (size_t i = 0; i < inputs.size(); ++i) { + platform::NPUDeviceGuard guard(places[i].GetDeviceId()); + const auto& hccl_stream = places_to_ctx_[key][i]->stream(); + fn(inputs[i], outputs[i], hccl_comms[i]->GetHcclComm(), hccl_stream); + } + + for (size_t i = 0; i < inputs.size(); ++i) { + platform::NPUDeviceGuard guard(places[i].GetDeviceId()); + task->control_events_[i].Record(*places_to_ctx_[key][i]); + } + return task; +} + +template +std::shared_ptr ProcessGroupHCCL::PointToPoint( + std::vector& tensors, Fn fn, int dst_rank, CommType op_type) { + const auto places = GetPlaceList(tensors); + const auto key = GetKeyFromPlaces(places); + + { + std::lock_guard lock(mutex_); + if (places_to_hcclcomm_.find(key) == places_to_hcclcomm_.end()) { + CreateHCCLManagerCache(key, places); + } + } + + auto& hccl_comms = places_to_hcclcomm_[key]; + + SyncDefaultStream(places, places_to_events_[key], 
places_to_ctx_[key]); + + auto task = CreateTask(places, rank_, op_type, tensors); + + // construct uninitialize guard for device + + // if (FLAGS_use_stream_safe_npu_allocator) { + // for (size_t i = 0; i < tensors.size(); ++i) { + // platform::NPUDeviceGuard guard(places[i].GetDeviceId()); + // auto dense_tensor = + // std::dynamic_pointer_cast(tensors[i].impl()); + // memory::RecordStream(dense_tensor->Holder(), + // places_to_ctx_[key][i]->stream()); + // } + // } + + for (size_t i = 0; i < tensors.size(); ++i) { + platform::NPUDeviceGuard guard(places[i].GetDeviceId()); + const auto& hccl_stream = places_to_ctx_[key][i]->stream(); + fn(tensors[i], hccl_comms[i]->GetHcclComm(), hccl_stream, dst_rank); + } + + for (size_t i = 0; i < tensors.size(); ++i) { + platform::NPUDeviceGuard guard(places[i].GetDeviceId()); + task->control_events_[i].Record(*places_to_ctx_[key][i]); + } + return task; +} + +std::shared_ptr ProcessGroupHCCL::AllReduce( + std::vector& tensors, const AllreduceOptions& opts) { + // PADDLE_ENFORCE_EQ( + // CheckTensorsInNPUPlace(tensors), true, + // platform::errors::InvalidArgument("All inputs should be in + // NPUPlace.")); + return Collective( + tensors, tensors, + [&](const Tensor& input, Tensor& output, HcclComm comm, + const aclrtStream& stream) { + auto input_tensor = + std::dynamic_pointer_cast(input.impl()); + auto output_tensor = + std::dynamic_pointer_cast(output.impl()); + return platform::dynload::HcclAllReduce( + input_tensor->data(), output_tensor->data(), input_tensor->numel(), + platform::ToHCCLDataType(input.type()), + ToHCCLRedType(opts.reduce_op), comm, stream); + }, + CommType::ALLREDUCE); +} + +std::shared_ptr ProcessGroupHCCL::Broadcast( + std::vector& tensors, const BroadcastOptions& opts) { + // PADDLE_ENFORCE_EQ( + // CheckTensorsInNPUPlace(tensors), true, + // platform::errors::InvalidArgument("All inputs should be in + // CudaPlace.")); + + return Collective( + tensors, tensors, + [&](Tensor& input, Tensor& output, HcclComm comm, + const aclrtStream& stream) { + const auto root = opts.source_rank * tensors.size() + opts.source_root; + auto input_tensor = + std::dynamic_pointer_cast(input.impl()); + auto output_tensor = + std::dynamic_pointer_cast(output.impl()); + return platform::dynload::HcclBroadcast( + input_tensor->data(), input_tensor->numel(), + platform::ToHCCLDataType(input.type()), root, comm, stream); + }, + CommType::BROADCAST); +} + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/collective/ProcessGroupHCCL.h b/paddle/fluid/distributed/collective/ProcessGroupHCCL.h new file mode 100644 index 0000000000000000000000000000000000000000..83d509be2cdd7b79faf4e2a2f510c34361b94157 --- /dev/null +++ b/paddle/fluid/distributed/collective/ProcessGroupHCCL.h @@ -0,0 +1,129 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
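// Illustrative note, not part of this patch: in the HCCL Broadcast above the
// global root is flattened from (source_rank, source_root) as
//   root = source_rank * tensors.size() + source_root,
// e.g. with one tensor per rank, source_rank = 1 and source_root = 0 give
// global root 1. A minimal helper expressing the same arithmetic (the name is
// hypothetical):
inline int FlattenBroadcastRoot(int source_rank, int source_root,
                                size_t tensors_per_rank) {
  return source_rank * static_cast<int>(tensors_per_rank) + source_root;
}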
+ +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/distributed/collective/ProcessGroup.h" +#include "paddle/fluid/platform/device/npu/npu_stream.h" +#include "paddle/fluid/platform/device_context.h" + +#include "paddle/fluid/distributed/collective/HCCLTools.h" +#include "paddle/fluid/distributed/store/store.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/gen_comm_id_helper.h" +#include "paddle/fluid/platform/place.h" + +constexpr const char* HCCL_BACKEND_NAME = "HCCL"; + +namespace paddle { +namespace distributed { + +using Place = paddle::platform::Place; +using NPUStream = platform::stream::NPUStream; +using NPUDeviceContext = paddle::platform::NPUDeviceContext; + +class ProcessGroupHCCL : public ProcessGroup { + public: + class HCCLTask : public ProcessGroup::Task, + public std::enable_shared_from_this { + public: + HCCLTask(const std::vector& places, int rank, CommType CommType, + const std::vector& inputs); + + bool IsCompleted(); + + void SynchronizeStreams(); + + bool Wait(std::chrono::milliseconds timeout = kWaitTimeout); + + void Synchronize(); + + void SetOutputs(std::vector& outputs); // NOLINT + + virtual ~HCCLTask(); + + std::vector control_events_; + + protected: + std::vector places_; + std::vector> hcclComms_; + std::shared_ptr> outputs_; + + private: + }; + + ProcessGroupHCCL(const std::shared_ptr& store, int rank, int size); + + const std::string GetBackendName() const override { + return std::string(HCCL_BACKEND_NAME); + } + + std::shared_ptr AllReduce( + std::vector& tensors, + const AllreduceOptions& = AllreduceOptions()) override; + + std::shared_ptr Broadcast( + std::vector& tensors, + const BroadcastOptions& = BroadcastOptions()) override; + + protected: + virtual std::shared_ptr CreateTask( + std::vector places, int rank, CommType opType, + const std::vector& inputs); + + std::shared_ptr store_; + std::shared_ptr hccl_comm_; + std::mutex mutex_; + std::unordered_map>> + places_to_hcclcomm_; + + std::unordered_map> + places_to_events_; + + std::unordered_map>> + places_to_ctx_; + + std::set used_place_ids_; + + private: + void BcastHCCLId(std::vector& hccl_ids, int root, // NOLINT + int server_fd); + + void BroadcastUniqueHCCLID(std::vector& hccl_ids); // NOLINT + + template + std::shared_ptr Collective( + std::vector& inputs, // NOLINT + std::vector& outputs, // NOLINT + Fn fn, CommType op_type); + + template + std::shared_ptr PointToPoint( + std::vector& tensors, // NOLINT + Fn fn, int dst_rank, CommType op_type); + + void CreateHCCLManagerCache(const std::string& places_key, + const std::vector& places); +}; + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc index fe2325423b460d7b42e08b03cf9b083bc94fc7b6..67715f410d443c38a1c5d92c560a35a909c5ec1c 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc @@ -14,6 +14,9 @@ #include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/phi/api/include/api.h" +#include "paddle/phi/common/place.h" DECLARE_bool(nccl_blocking_wait); DECLARE_bool(use_stream_safe_cuda_allocator); @@ -139,42 +142,41 @@ bool ProcessGroupNCCL::NCCLTask::Wait(std::chrono::milliseconds timeout) { 
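// Illustrative sketch, not part of this patch: the store-based rendezvous
// pattern that BroadcastUniqueNCCLID below (and BroadcastUniqueHCCLID above)
// follows. Rank 0 serializes each unique id under a well-known key; the other
// ranks block on get() for the same key. KVStore and RendezvousUniqueIds are
// hypothetical stand-ins for paddle::distributed::Store and the real methods.
#include <algorithm>
#include <cstdint>
#include <cstring>
#include <string>
#include <vector>

template <typename KVStore, typename UniqueId>
void RendezvousUniqueIds(KVStore* store, int rank, const std::string& prefix,
                         std::vector<UniqueId>* ids) {
  for (size_t i = 0; i < ids->size(); ++i) {
    const std::string key = prefix + std::to_string(i);
    if (rank == 0) {
      const auto* bytes = reinterpret_cast<const uint8_t*>(&(*ids)[i]);
      store->set(key, std::vector<uint8_t>(bytes, bytes + sizeof(UniqueId)));
    } else {
      auto value = store->get(key);  // TCPStore::get() waits until the key exists
      std::memcpy(&(*ids)[i], value.data(),
                  std::min(value.size(), sizeof(UniqueId)));
    }
  }
}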
std::this_thread::sleep_for(std::chrono::milliseconds(kWaitBlockTImeout)); } } + + if (!barrierTensors_.empty()) { + // If we use the work to do barrier, we should block cpu + for (auto& place : places_) { + platform::CUDADeviceGuard gpuGuard(place); + PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); + } + } return true; } // Same as Wait void ProcessGroupNCCL::NCCLTask::Synchronize() { Wait(kWaitTimeout); } -ProcessGroupNCCL::ProcessGroupNCCL(const ProcessGroupStrategy& strategy, +ProcessGroupNCCL::ProcessGroupNCCL(const std::shared_ptr& store, int rank, int size) - : ProcessGroup(rank, size), strategy_(strategy) {} - -void ProcessGroupNCCL::BcastNCCLId( - std::vector& nccl_ids, // NOLINT - int root, int server_fd) { - if (strategy_.local_rank_ == root) { - std::vector other_trainers; - for (auto& ep : strategy_.trainer_endpoints_) { - if (ep != strategy_.current_endpoint_) { - other_trainers.push_back(ep); - } - } - platform::SendBroadCastCommID(other_trainers, &nccl_ids); - } else { - platform::RecvBroadCastCommID(server_fd, strategy_.current_endpoint_, - &nccl_ids); - } -} + : ProcessGroup(rank, size), store_(store) {} void ProcessGroupNCCL::BroadcastUniqueNCCLID( std::vector& nccl_ids) { // NOLINT - - int server_fd = -1; - if (rank_ != 0) { - server_fd = platform::SocketServer::GetInstance(strategy_.current_endpoint_) - .socket(); + if (rank_ == 0) { + for (size_t i = 0; i < nccl_ids.size(); i++) { + auto key = "ProcessGroupNCCL/nccl_ids/" + std::to_string(i); + auto nccl_id = std::vector( + reinterpret_cast(&nccl_ids[i]), + reinterpret_cast(&nccl_ids[i]) + NCCL_UNIQUE_ID_BYTES); + store_->set(key, nccl_id); + } + } else { + for (size_t i = 0; i < nccl_ids.size(); i++) { + auto key = "ProcessGroupNCCL/nccl_ids/" + std::to_string(i); + auto ret = store_->get(key); + std::memcpy(&nccl_ids[i], ret.data(), ret.size()); + } } - BcastNCCLId(nccl_ids, 0, server_fd); } // create NCCLManager cache for places_key @@ -193,13 +195,17 @@ void ProcessGroupNCCL::CreateNCCLManagerCache( nccl_ids.resize(1); auto& nccl_id = nccl_ids.front(); + for (auto& place : places) { + used_place_ids_.insert(place.GetDeviceId()); + } + if (rank_ == 0) { PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGetUniqueId(&nccl_id)); } BroadcastUniqueNCCLID(nccl_ids); - VLOG(3) << "init nccl rank: " << strategy_.local_rank_ - << ", nranks: " << strategy_.nranks_ << ", place: " << places_key + VLOG(3) << "init nccl rank: " << rank_ << ", nranks: " << size_ + << ", place: " << places_key << ", nccl uniqueid: " << SerializeNCCLUniqueId(nccl_id); std::vector> dev_ctx; @@ -274,6 +280,54 @@ std::shared_ptr ProcessGroupNCCL::Collective( return task; } +template +std::shared_ptr ProcessGroupNCCL::PointToPoint( + std::vector& tensors, Fn fn, int dst_rank, CommType op_type) { + const auto places = GetPlaceList(tensors); + const auto key = GetKeyFromPlaces(places); + + { + std::lock_guard lock(mutex_); + if (places_to_ncclcomm_.find(key) == places_to_ncclcomm_.end()) { + CreateNCCLManagerCache(key, places); + } + } + + auto& nccl_comms = places_to_ncclcomm_[key]; + + SyncDefaultStream(places, places_to_events_[key], places_to_ctx_[key]); + + auto task = CreateTask(places, rank_, op_type, tensors); + + // construct uninitialize guard for device + platform::CUDADeviceGuard cuda_guard; + + if (FLAGS_use_stream_safe_cuda_allocator) { + for (size_t i = 0; i < tensors.size(); ++i) { + cuda_guard.SetDevice(places[i]); + auto dense_tensor = + std::dynamic_pointer_cast(tensors[i].impl()); + memory::RecordStream(dense_tensor->Holder(), + 
places_to_ctx_[key][i]->stream()); + } + } + + { + platform::NCCLGroupGuard nccl_guard; + for (size_t i = 0; i < tensors.size(); ++i) { + cuda_guard.SetDevice(places[i]); + const auto& nccl_stream = places_to_ctx_[key][i]->stream(); + fn(tensors[i], nccl_comms[i]->GetNcclComm(), nccl_stream, dst_rank); + } + } + + for (size_t i = 0; i < tensors.size(); ++i) { + cuda_guard.SetDevice(places[i]); + task->control_events_[i].Record(*places_to_ctx_[key][i]); + } + return task; +} + std::shared_ptr ProcessGroupNCCL::AllReduce( std::vector& tensors, const AllreduceOptions& opts) { PADDLE_ENFORCE_EQ( @@ -317,5 +371,241 @@ std::shared_ptr ProcessGroupNCCL::Broadcast( CommType::BROADCAST); } +std::shared_ptr ProcessGroupNCCL::Barrier( + const BarrierOptions& opts) { + std::vector places; + + if (!opts.place_ids.empty()) { + for (auto place_id : opts.place_ids) { + places.emplace_back(place_id); + } + } else if (!used_place_ids_.empty()) { + for (auto place_id : used_place_ids_) { + places.emplace_back(place_id); + } + } else { + auto numGPUs = GetSize(); + int place_id = static_cast(rank_ % numGPUs); + places.emplace_back(place_id); + } + + std::vector barrierTensors; + barrierTensors.reserve(places.size()); + + platform::CUDADeviceGuard gpuGuard; + for (auto& place : places) { + gpuGuard.SetDeviceIndex(place.GetDeviceId()); + auto dt = full({1}, 0, phi::DataType::FLOAT32, phi::Backend::GPU); + barrierTensors.push_back(dt); + } + auto task = ProcessGroupNCCL::AllReduce(barrierTensors); + auto nccl_task = dynamic_cast(task.get()); + nccl_task->barrierTensors_ = std::move(barrierTensors); + return task; +} + +void CheckTensorsInDifferentDevices(const std::vector& tensors, + const size_t num_devices) { + PADDLE_ENFORCE_EQ( + tensors.size() == 0, false, + platform::errors::InvalidArgument("Tensor list must be nonempty.")); + PADDLE_ENFORCE_LE( + tensors.size(), num_devices, + platform::errors::InvalidArgument( + "Tensor list mustn't be larger than the number of available GPUs.")); + + std::set used_devices; + + for (const auto& t : tensors) { + PADDLE_ENFORCE_EQ(t.is_cuda() && t.is_dense_tensor(), true, + platform::errors::InvalidArgument( + "Tensors must be CUDA and dense tensor.")); + + const auto inserted = used_devices.insert(t.inner_place()).second; + PADDLE_ENFORCE_EQ(inserted, true, + platform::errors::InvalidArgument( + "Tensors must be on distinct GPU devices.")); + } +} + +std::shared_ptr ProcessGroupNCCL::Send( + std::vector& tensors, int dst_rank) { + CheckTensorsInDifferentDevices(tensors, static_cast(GetSize())); + + auto task = PointToPoint( + tensors, + [&](Tensor& input, ncclComm_t comm, const gpuStream_t& stream, + int dst_rank) { + auto input_tensor = + std::dynamic_pointer_cast(input.impl()); + return platform::dynload::ncclSend( + input_tensor->data(), input_tensor->numel(), + platform::ToNCCLDataType(input.type()), dst_rank, comm, stream); + }, + dst_rank, CommType::SEND); + return task; +} + +std::shared_ptr ProcessGroupNCCL::Recv( + std::vector& tensors, int src_rank) { + CheckTensorsInDifferentDevices(tensors, static_cast(GetSize())); + + auto task = PointToPoint( + tensors, + [&](Tensor& output, ncclComm_t comm, const gpuStream_t& stream, + int src_rank) { + auto output_tensor = + std::dynamic_pointer_cast(output.impl()); + return platform::dynload::ncclRecv( + output_tensor->data(), output_tensor->numel(), + platform::ToNCCLDataType(output.type()), src_rank, comm, stream); + }, + src_rank, CommType::RECV); + return task; +} + +std::shared_ptr ProcessGroupNCCL::AllGather( + 
std::vector& in_tensors, std::vector& out_tensors) { + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(in_tensors), true, + platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(out_tensors), true, + platform::errors::InvalidArgument("All outputs should be in CudaPlace.")); + return Collective( + in_tensors, out_tensors, + [&](const Tensor& input, Tensor& output, ncclComm_t comm, + const gpuStream_t& stream) { + auto input_tensor = + std::dynamic_pointer_cast(input.impl()); + auto output_tensor = + std::dynamic_pointer_cast(output.impl()); + return platform::dynload::ncclAllGather( + input_tensor->data(), output_tensor->data(), input_tensor->numel(), + platform::ToNCCLDataType(input.type()), comm, stream); + }, + CommType::ALLGATHER); +} + +void* GetPointerByOffset(void* raw_pointer, size_t offset, + experimental::DataType type) { + if (type == experimental::DataType::FLOAT32) { + return reinterpret_cast(reinterpret_cast(raw_pointer) + + offset); + } else if (type == experimental::DataType::FLOAT64) { + return reinterpret_cast(reinterpret_cast(raw_pointer) + + offset); + } else if (type == experimental::DataType::INT32) { + return reinterpret_cast(reinterpret_cast(raw_pointer) + + offset); + } else if (type == experimental::DataType::INT64) { + return reinterpret_cast(reinterpret_cast(raw_pointer) + + offset); + } else if (type == experimental::DataType::FLOAT16) { + return reinterpret_cast(reinterpret_cast(raw_pointer) + + offset); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "This datatype in nccl is not supported.")); + } +} + +std::shared_ptr ProcessGroupNCCL::AllToAll( + std::vector& in_tensors, std::vector& out_tensors) { + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(in_tensors), true, + platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(out_tensors), true, + platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); + return Collective( + in_tensors, out_tensors, + [&](const Tensor& input, Tensor& output, ncclComm_t comm, + const gpuStream_t& stream) { + auto input_tensor = + std::dynamic_pointer_cast(input.impl()); + auto output_tensor = + std::dynamic_pointer_cast(output.impl()); + size_t offset = 0; + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); + for (auto i = 0; i < size_; i++) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend( + GetPointerByOffset(input_tensor->data(), offset, input.type()), + input_tensor->numel() / size_, + platform::ToNCCLDataType(input.type()), i, comm, stream)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( + GetPointerByOffset(output_tensor->data(), offset, input.type()), + input_tensor->numel() / size_, + platform::ToNCCLDataType(input.type()), i, comm, stream)); + offset += input_tensor->numel() / size_; + } + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); + }, + CommType::ALLREDUCE); +} + +std::shared_ptr ProcessGroupNCCL::Reduce( + std::vector& tensors, const ReduceOptions& opts) { + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(tensors), true, + platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); + return Collective( + tensors, tensors, + [&](const Tensor& input, Tensor& output, ncclComm_t comm, + const gpuStream_t& stream) { + auto input_tensor = + std::dynamic_pointer_cast(input.impl()); + auto output_tensor = + std::dynamic_pointer_cast(output.impl()); + 
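        // Unlike AllToAll and Scatter in this file, Reduce hands the whole
        // tensor to a single call: ncclReduce combines the inputs from all
        // ranks with the mapped reduce_op and writes the result only on
        // opts.root_rank; other ranks' output buffers are not updated.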
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclReduce( + input_tensor->data(), output_tensor->data(), input.numel(), + platform::ToNCCLDataType(input.type()), + ToNCCLRedType(opts.reduce_op), opts.root_rank, comm, stream)); + }, + CommType::REDUCE); +} + +std::shared_ptr ProcessGroupNCCL::Scatter( + std::vector& in_tensors, std::vector& out_tensors, + const ScatterOptions& opts) { + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(in_tensors), true, + platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(out_tensors), true, + platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); + return Collective( + in_tensors, out_tensors, + [&](const Tensor& input, Tensor& output, ncclComm_t comm, + const gpuStream_t& stream) { + auto input_tensor = + std::dynamic_pointer_cast(input.impl()); + auto output_tensor = + std::dynamic_pointer_cast(output.impl()); + size_t offset = 0; + if (rank_ == opts.root_rank) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); + for (auto i = 0; i < size_; i++) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend( + GetPointerByOffset(input_tensor->data(), offset, input.type()), + input_tensor->numel() / size_, + platform::ToNCCLDataType(input.type()), i, comm, stream)); + offset += input_tensor->numel() / size_; + } + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( + output_tensor->data(), input_tensor->numel() / size_, + platform::ToNCCLDataType(input.type()), opts.root_rank, comm, + stream)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); + } else { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( + output_tensor->data(), input_tensor->numel() / size_, + platform::ToNCCLDataType(input.type()), opts.root_rank, comm, + stream)); + } + }, + CommType::SCATTER); +} + } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h index 9f06566d1c86386acad3758be283e716f46c1951..aa2a2b8fa2088cd30729ba5e6184ef7a9c507bf3 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h @@ -25,6 +25,7 @@ #include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/distributed/store/store.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/gen_comm_id_helper.h" #include "paddle/fluid/platform/place.h" @@ -65,6 +66,7 @@ class ProcessGroupNCCL : public ProcessGroup { virtual ~NCCLTask(); std::vector control_events_; + std::vector barrierTensors_; protected: std::vector places_; @@ -74,7 +76,7 @@ class ProcessGroupNCCL : public ProcessGroup { private: }; - ProcessGroupNCCL(const ProcessGroupStrategy& strategy, int rank, int size); + ProcessGroupNCCL(const std::shared_ptr& store, int rank, int size); const std::string GetBackendName() const override { return std::string(NCCL_BACKEND_NAME); @@ -88,13 +90,36 @@ class ProcessGroupNCCL : public ProcessGroup { std::vector& tensors, const BroadcastOptions& = BroadcastOptions()) override; + std::shared_ptr Barrier( + const BarrierOptions& = BarrierOptions()) override; + + std::shared_ptr Send(std::vector& tensors, + int dst_rank) override; + + std::shared_ptr Recv(std::vector& tensors, + int src_rank) override; + + std::shared_ptr AllGather( + std::vector& in_tensors, + std::vector& out_tensors) override; + + std::shared_ptr AllToAll( + 
std::vector& in, std::vector& out) override; + + std::shared_ptr Reduce( + std::vector& tensors, const ReduceOptions& opts) override; + + std::shared_ptr Scatter(std::vector& in_tensors, + std::vector& out_tensors, + const ScatterOptions&) override; + protected: virtual std::shared_ptr CreateTask( std::vector places, int rank, CommType opType, const std::vector& inputs); protected: - ProcessGroupStrategy strategy_; + std::shared_ptr store_; std::shared_ptr nccl_comm_; std::mutex mutex_; std::unordered_map>> @@ -106,6 +131,8 @@ class ProcessGroupNCCL : public ProcessGroup { std::vector>> places_to_ctx_; + std::set used_place_ids_; + private: void BcastNCCLId(std::vector& nccl_ids, int root, // NOLINT int server_fd); @@ -118,6 +145,11 @@ class ProcessGroupNCCL : public ProcessGroup { std::vector& outputs, // NOLINT Fn fn, CommType op_type); + template + std::shared_ptr PointToPoint( + std::vector& tensors, // NOLINT + Fn fn, int dst_rank, CommType op_type); + void CreateNCCLManagerCache(const std::string& places_key, const std::vector& places); }; diff --git a/paddle/fluid/distributed/collective/Types.h b/paddle/fluid/distributed/collective/Types.h index 654d06686957bd4242fa474c215ccf7c117e5910..973f7c643542757c0bce68f8ccdefeadc97f15d4 100644 --- a/paddle/fluid/distributed/collective/Types.h +++ b/paddle/fluid/distributed/collective/Types.h @@ -32,5 +32,18 @@ struct BroadcastOptions { int source_root = 0; }; +struct BarrierOptions { + std::vector place_ids; +}; + +struct ReduceOptions { + ReduceOp reduce_op = ReduceOp::SUM; + int root_rank = 0; +}; + +struct ScatterOptions { + int root_rank = 0; +}; + } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/collective/reducer.cc b/paddle/fluid/distributed/collective/reducer.cc new file mode 100644 index 0000000000000000000000000000000000000000..59f3ea3b0a7d85651e7780b4b11875f19b70931e --- /dev/null +++ b/paddle/fluid/distributed/collective/reducer.cc @@ -0,0 +1,131 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
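// Illustrative sketch, not part of this patch: the core bucketing idea behind
// Eager_AssignGroupBySize in the new reducer.cc below, stripped of the
// sparse-gradient and per-dtype bookkeeping. Tensors (represented here only by
// their byte sizes) are appended to the current group until its total reaches
// the size limit, at which point a new group is started. The function name is
// hypothetical.
#include <cstdint>
#include <utility>
#include <vector>

std::vector<std::vector<size_t>> AssignGroupBySizeSimple(
    const std::vector<int64_t>& tensor_bytes, size_t group_size_limit) {
  std::vector<std::vector<size_t>> groups;
  std::vector<size_t> current;
  size_t current_bytes = 0;
  for (size_t i = 0; i < tensor_bytes.size(); ++i) {
    current.push_back(i);
    current_bytes += static_cast<size_t>(tensor_bytes[i]);
    if (current_bytes >= group_size_limit) {
      // this group is full; start a new one for the next tensor
      groups.emplace_back(std::move(current));
      current.clear();
      current_bytes = 0;
    }
  }
  if (!current.empty()) {
    groups.emplace_back(std::move(current));
  }
  return groups;
}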
+ +#include "paddle/fluid/distributed/collective/reducer.h" +#include "paddle/phi/common/data_type.h" + +namespace paddle { +namespace distributed { + +std::vector> Eager_AssignGroupBySize( + const std::vector tensors, + const std::vector &is_sparse_gradient, + const std::vector &group_size_limits, + const std::vector &tensor_indices) { + PADDLE_ENFORCE_EQ( + tensors.size(), is_sparse_gradient.size(), + platform::errors::PreconditionNotMet( + "tensors len must be equal to is_sparse_gradient len, but " + "[%lu] != [%lu]", + tensors.size(), is_sparse_gradient.size())); + auto check_perm = [](const std::vector &x) -> bool { + size_t len = x.size(); + std::vector cnt(len, 0); + for (size_t i = 0; i < len; ++i) { + if (x[i] >= static_cast(len) || x[i] < 0 || cnt[x[i]]) { + return false; + } + cnt[x[i]]++; + } + return true; + }; + + PADDLE_ENFORCE_EQ(true, check_perm(tensor_indices), + platform::errors::PreconditionNotMet( + "tensor_indices must be a permutation from 0 to %lu", + tensor_indices.size())); + // the return vector + std::vector> res; + + // Key: the var type + // Value: should use which index in group_size_limits for group size limit + std::map group_limit_index; + + // Key: the var type + // Value: + std::map, size_t>> + next_group; + + for (size_t i = 0; i < tensors.size(); ++i) { + const auto &var = tensors[i]; + + size_t tensor_real_index = i; + if (!tensor_indices.empty()) { + tensor_real_index = tensor_indices[i]; + } + + if (is_sparse_gradient[tensor_real_index]) { + // we keep sparse var a single group + res.push_back({tensor_real_index}); + continue; + } + + const auto &var_dtype = var.dtype(); + VLOG(3) << "var[" << var.name() << "] 's type is " << var_dtype; + auto &group_info = next_group[var_dtype]; + + int64_t var_size = -1; + + if (var.is_dense_tensor()) { + var_size = + std::dynamic_pointer_cast(var.impl())->numel(); + } else { + VLOG(3) << "var " << var.name() + << " is not tensor or selected_rows, so skip it"; + continue; + } + + group_info.first.push_back(tensor_real_index); + group_info.second += experimental::SizeOf(var_dtype) * var_size; + // group_info.second += framework::SizeOfType(var_dtype) * var_size; + + if (group_limit_index.find(var_dtype) == group_limit_index.end()) { + // means it is the first var of var_dtype + group_limit_index[var_dtype] = 0; + } + auto &cur_limit_index = group_limit_index[var_dtype]; + if (group_info.second >= group_size_limits[cur_limit_index]) { + // exceed group capacity and create a new group + res.emplace_back(std::move(group_info.first)); + group_info = std::pair, size_t>(); + cur_limit_index = + (std::min)(cur_limit_index + 1, group_size_limits.size() - 1); + } + } + + // add the final groups + for (auto &e : next_group) { + auto &group_info = e.second; + if (!group_info.first.empty()) { + res.emplace_back(std::move(group_info.first)); + } + } + + for (const auto &group_index : res) { + PADDLE_ENFORCE_NE( + group_index.empty(), true, + platform::errors::PreconditionNotMet( + "AssignGroupBySize construct empty group, please check.")); + } + if (tensor_indices.empty()) { + std::sort(res.begin(), res.end(), + [](const std::vector &x, const std::vector &y) { + return x.front() < y.front(); + }); + } + return res; +} + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/collective/reducer.h b/paddle/fluid/distributed/collective/reducer.h new file mode 100644 index 0000000000000000000000000000000000000000..f8c75385ef8bd6891df8eda6faa93c73091c37f5 --- /dev/null +++ 
b/paddle/fluid/distributed/collective/reducer.h @@ -0,0 +1,32 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include "paddle/fluid/distributed/collective/ProcessGroup.h" +#include "paddle/fluid/eager/api/utils/tensor_utils.h" + +namespace paddle { +namespace distributed { +using Tensor = paddle::experimental::Tensor; + +std::vector> Eager_AssignGroupBySize( + const std::vector, const std::vector& is_sparse_gradient, + const std::vector& group_size_limits, + const std::vector& tensor_indices = {}); + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/fleet_executor/carrier.cc b/paddle/fluid/distributed/fleet_executor/carrier.cc index 56d8da3eca4b5a82ff6cdb8f4e3ff8638a02b437..0d5d328fd32cc2e12d4f4e94c94dae51f0c040bc 100644 --- a/paddle/fluid/distributed/fleet_executor/carrier.cc +++ b/paddle/fluid/distributed/fleet_executor/carrier.cc @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include + #include "paddle/fluid/distributed/fleet_executor/carrier.h" #include "paddle/fluid/distributed/fleet_executor/global.h" #include "paddle/fluid/distributed/fleet_executor/interceptor.h" @@ -46,7 +48,8 @@ void Carrier::Init( const std::unordered_map& interceptor_id_to_rank, const std::unordered_map& interceptor_id_to_node, const framework::ProgramDesc& program, framework::Scope* scope, - int64_t num_micro_batches, const platform::Place& place) { + int64_t num_micro_batches, const platform::Place& place, + const std::vector& inference_root_scope_vars) { rank_ = rank; interceptor_id_to_rank_ = interceptor_id_to_rank; interceptor_id_to_node_ = interceptor_id_to_node; @@ -60,7 +63,7 @@ void Carrier::Init( microbatch_scopes_.resize(num_micro_batches); for (int i = 0; i < num_micro_batches; ++i) { microbatch_scopes_[i] = &minibatch_scope_->NewScope(); - CopyParameters(i, program); + CopyParameters(i, program, inference_root_scope_vars); } // TODO(fleet_exe dev): thread pool @@ -80,12 +83,23 @@ void Carrier::Release() { Carrier::~Carrier() { VLOG(3) << "Carrier's destructor."; } -void Carrier::CopyParameters(int microbatch_id, - const framework::ProgramDesc& program) { +void Carrier::CopyParameters( + int microbatch_id, const framework::ProgramDesc& program, + const std::vector& inference_root_scope_vars) { auto& global_block = program.Block(0); + std::map inference_root_scope_var_map; + for (auto var_name : inference_root_scope_vars) { + inference_root_scope_var_map.insert({var_name, 1}); + } for (auto& var : global_block.AllVars()) { - if (var->Persistable() && microbatch_id == 0) { + std::string var_name = var->Name(); + bool force_root = inference_root_scope_var_map.find(var_name) != + inference_root_scope_var_map.end(); + if (force_root) { + VLOG(4) << var_name << " will be forced to be created in the root scope."; + } + if ((var->Persistable() || force_root) && microbatch_id == 
0) { auto* ptr = root_scope_->Var(var->Name()); InitializeVariable(ptr, var->GetType()); VLOG(5) << "Create persistable var: " << var->Name() diff --git a/paddle/fluid/distributed/fleet_executor/carrier.h b/paddle/fluid/distributed/fleet_executor/carrier.h index 9a74fa78c0e7638cd9c5201b92b06619c1f5b10c..d35a3260915e2cfd40bea9dc03fe6af7d9d04c54 100644 --- a/paddle/fluid/distributed/fleet_executor/carrier.h +++ b/paddle/fluid/distributed/fleet_executor/carrier.h @@ -57,9 +57,12 @@ class Carrier final { const std::unordered_map& interceptor_id_to_rank, const std::unordered_map& interceptor_id_to_node, const framework::ProgramDesc& program, framework::Scope* scope, - int64_t num_micro_batches, const platform::Place& place); + int64_t num_micro_batches, const platform::Place& place, + const std::vector& inference_root_scope_vars = {}); - void CopyParameters(int microbatch_id, const framework::ProgramDesc& program); + void CopyParameters( + int microbatch_id, const framework::ProgramDesc& program, + const std::vector& inference_root_scope_vars); void Release(); void Wait(); diff --git a/paddle/fluid/distributed/fleet_executor/fleet_executor.cc b/paddle/fluid/distributed/fleet_executor/fleet_executor.cc index 457549a27b4b7ed6305b107cfd319ecae026a53b..e946d78550ff1bb0155843a680fbec33fdca9aa3 100644 --- a/paddle/fluid/distributed/fleet_executor/fleet_executor.cc +++ b/paddle/fluid/distributed/fleet_executor/fleet_executor.cc @@ -11,6 +11,7 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. +#include #include "paddle/fluid/distributed/fleet_executor/fleet_executor.h" #include "paddle/fluid/distributed/fleet_executor/global.h" @@ -52,7 +53,8 @@ void FleetExecutor::Init( const std::string& carrier_id, const framework::ProgramDesc& program_desc, framework::Scope* scope, const platform::Place& place, int64_t num_micro_batches, const std::vector& task_nodes, - const std::unordered_map& task_id_to_rank) { + const std::unordered_map& task_id_to_rank, + const std::vector& inference_root_scope_vars) { PADDLE_ENFORCE_GT(task_nodes.size(), 0, platform::errors::InvalidArgument( "Fleet executor is inited with empty task node")); @@ -64,6 +66,37 @@ void FleetExecutor::Init( } } auto unused_vars = framework::GetUnusedVars(program_desc.Block(0), ops, {}); + // NOTE: For inference, the vars in inference_root_scope_vars + // shouldn't be deleted during inf, for that they may be the result of the + // inf. If they are GCed, it will cause error during ZeroCopy the result. 
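  // The two loops below implement the note above in two passes: pass 1 erases
  // every protected name in inference_root_scope_vars from each op's
  // unused-variable list and records which ops were touched; pass 2 removes
  // ops whose unused-variable lists became empty, so GC never frees a variable
  // that still backs an inference output.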
+ std::vector changed_ops; + for (auto pair : unused_vars) { + const framework::OperatorBase* op = pair.first; + std::vector unused = pair.second; + for (auto name : inference_root_scope_vars) { + auto iter = std::find(unused.begin(), unused.end(), name); + if (iter != unused.end()) { + VLOG(3) << "Removing var: [" << name + << "] from the unused vars list of op: [" << op->Type() << "]"; + unused.erase(iter); + if (std::find(changed_ops.begin(), changed_ops.end(), op) == + changed_ops.end()) { + // record the op whose unused vars have been updated + changed_ops.emplace_back(op); + } + } + } + // update the unused vars list in the map + unused_vars[op] = unused; + } + for (auto op : changed_ops) { + auto iter = unused_vars.find(op); + if (iter->second.empty()) { + // remove those ops in the map that have empty unused vars list + VLOG(3) << "Removing op: [" << op->Type() << "] from unused_vars map."; + unused_vars.erase(iter); + } + } runtime_graph_ = std::make_shared(); std::unordered_map interceptor_id_to_task; for (auto task_node : task_nodes) { @@ -82,17 +115,18 @@ void FleetExecutor::Init( carrier_ids_.insert(carrier_id); // Set current running carrier GlobalVal::Set(new std::string(carrier_id)); - InitCarrier(carrier, scope, place, num_micro_batches, program_desc); + InitCarrier(carrier, scope, place, num_micro_batches, program_desc, + inference_root_scope_vars); GlobalVal::Get()->Barrier(); } -void FleetExecutor::InitCarrier(Carrier* carrier, framework::Scope* scope, - const platform::Place& place, - int64_t num_micro_batches, - const framework::ProgramDesc& program_desc) { +void FleetExecutor::InitCarrier( + Carrier* carrier, framework::Scope* scope, const platform::Place& place, + int64_t num_micro_batches, const framework::ProgramDesc& program_desc, + const std::vector& inference_root_scope_vars) { carrier->Init(exe_desc_.cur_rank(), runtime_graph_->interceptor_id_to_rank(), runtime_graph_->interceptor_id_to_node(), program_desc, scope, - num_micro_batches, place); + num_micro_batches, place, inference_root_scope_vars); } void FleetExecutor::InitMessageBus() { diff --git a/paddle/fluid/distributed/fleet_executor/fleet_executor.h b/paddle/fluid/distributed/fleet_executor/fleet_executor.h index fa65309127bec50869c52d2f3c85477910ccb37b..ccdb3dcc459489db9f342a2302fae3d777170313 100644 --- a/paddle/fluid/distributed/fleet_executor/fleet_executor.h +++ b/paddle/fluid/distributed/fleet_executor/fleet_executor.h @@ -42,15 +42,17 @@ class FleetExecutor final { const framework::ProgramDesc& program_desc, framework::Scope* scope, const platform::Place& place, int64_t num_micro_batches, const std::vector& task_nodes, - const std::unordered_map& task_id_to_rank); + const std::unordered_map& task_id_to_rank, + const std::vector& inference_root_scope_vars = {}); void Run(const std::string& carrier_id); private: DISABLE_COPY_AND_ASSIGN(FleetExecutor); void InitMessageBus(); - void InitCarrier(Carrier* carrier, framework::Scope* scope, - const platform::Place& place, int64_t num_micro_batches, - const framework::ProgramDesc& program_desc); + void InitCarrier( + Carrier* carrier, framework::Scope* scope, const platform::Place& place, + int64_t num_micro_batches, const framework::ProgramDesc& program_desc, + const std::vector& inference_root_scope_vars = {}); FleetExecutorDesc exe_desc_; std::shared_ptr runtime_graph_; std::unordered_set carrier_ids_; diff --git a/paddle/fluid/distributed/fleet_executor/task_node.cc b/paddle/fluid/distributed/fleet_executor/task_node.cc index 
6de7038b3231f2fb302dd970273c565c5a718b73..95e4c73305998e4190c1547cb2f92809e360b216 100644 --- a/paddle/fluid/distributed/fleet_executor/task_node.cc +++ b/paddle/fluid/distributed/fleet_executor/task_node.cc @@ -52,11 +52,20 @@ void TaskNode::SetProgram(paddle::framework::ProgramDesc* program) { program_ = program; } -void TaskNode::Init() { +void TaskNode::Init(bool use_feed_fetch_ops) { + if (!use_feed_fetch_ops) { + VLOG(3) << "TaskNode will be inited without feed and fetch ops"; + } if (ops_.empty()) { // Q (for fleet executor dev): should we need another reset funct? VLOG(3) << "Task node will be inited by calling Init()."; for (const auto& op_desc : program_->Block(0).AllOps()) { + if (!use_feed_fetch_ops && + (op_desc->Type() == "feed" || op_desc->Type() == "fetch")) { + VLOG(3) << "TaskNode will skip [" << op_desc->Input("X")[0] << "], " + << op_desc->Type() << " -> " << op_desc->Output("Out")[0]; + continue; + } ops_vec_.emplace_back(framework::OpRegistry::CreateOp(*op_desc)); } for (const auto& op : ops_vec_) { diff --git a/paddle/fluid/distributed/fleet_executor/task_node.h b/paddle/fluid/distributed/fleet_executor/task_node.h index b655d140d37a5bdf547a278eec3355ef4638539f..4764d4fd4af87adf3df31f2dabb614da7d719861 100644 --- a/paddle/fluid/distributed/fleet_executor/task_node.h +++ b/paddle/fluid/distributed/fleet_executor/task_node.h @@ -46,7 +46,7 @@ class TaskNode final { ~TaskNode() = default; void SetProgram(paddle::framework::ProgramDesc* program); - void Init(); + void Init(bool use_feed_fetch_ops = true); int64_t rank() const { return rank_; } int64_t task_id() const { return task_id_; } int32_t role() const { return role_; } diff --git a/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc b/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc index 18920d06f38543cc3f7aeb045e7c3058143e006e..ba039385a74ba45aa1f33ba38138d8e5213f2e00 100644 --- a/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc +++ b/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc @@ -24,10 +24,14 @@ limitations under the License. 
*/ #include "paddle/fluid/distributed/fleet_executor/task_node.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/program_desc.h" +#include "paddle/phi/core/kernel_registry.h" USE_OP_ITSELF(elementwise_add); USE_OP_ITSELF(fill_constant); +PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); + namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/store/store.h b/paddle/fluid/distributed/store/store.h index 2673314d222d2b32e42c42a3a94df71a1887914a..7b4ae7e70ff6f033e038f1c5214f46e0876257d2 100644 --- a/paddle/fluid/distributed/store/store.h +++ b/paddle/fluid/distributed/store/store.h @@ -25,13 +25,26 @@ namespace distributed { class Store { public: - Store() = delete; + Store() : _timeout(tcputils::kNoTimeout) {} explicit Store(const std::chrono::seconds& timeout) : _timeout(timeout) {} virtual ~Store() = default; - virtual int64_t add(const std::string& key, int64_t value) = 0; - virtual std::vector get(const std::string& key) = 0; - virtual void wait(const std::string& key) = 0; + virtual int64_t add(const std::string& key, int64_t value) { + PADDLE_THROW(platform::errors::InvalidArgument( + "Implement the add method in the subclass.")); + } + virtual std::vector get(const std::string& key) { + PADDLE_THROW(platform::errors::InvalidArgument( + "Implement the add method in the subclass.")); + } + virtual void wait(const std::string& key) { + PADDLE_THROW(platform::errors::InvalidArgument( + "Implement the add method in the subclass.")); + } + virtual void set(const std::string& key, const std::vector& value) { + PADDLE_THROW(platform::errors::InvalidArgument( + "Implement the add method in the subclass.")); + } virtual const std::chrono::seconds& timeout() const { return _timeout; } diff --git a/paddle/fluid/distributed/store/tcp_store.cc b/paddle/fluid/distributed/store/tcp_store.cc index de85ac0d910e93257a308052ca1fcf193680a183..b0d5add49565ffb19762778ddd44a388b140c0ee 100644 --- a/paddle/fluid/distributed/store/tcp_store.cc +++ b/paddle/fluid/distributed/store/tcp_store.cc @@ -27,11 +27,13 @@ namespace detail { constexpr int INFTIME = -1; -std::unique_ptr MasterDaemon::start(SocketType socket) { - return std::make_unique(socket); +std::unique_ptr MasterDaemon::start(SocketType socket, + int nranks) { + return std::make_unique(socket, nranks); } -MasterDaemon::MasterDaemon(SocketType socket) : _listen_socket(socket) { +MasterDaemon::MasterDaemon(SocketType socket, int nranks) + : _listen_socket(socket), _nranks(nranks) { _background_thread = std::thread{&MasterDaemon::run, this}; } @@ -64,27 +66,35 @@ void MasterDaemon::_do_add(SocketType socket) { tcputils::send_value(socket, new_value); } +void MasterDaemon::_do_set(SocketType socket) { + VLOG(3) << "MasterDaemon::_do_set"; + std::string key = tcputils::receive_string(socket); + auto value = tcputils::receive_vector(socket); + _store[key] = value; +} + void MasterDaemon::_do_get(SocketType socket) { + VLOG(3) << "MasterDaemon::_do_get"; std::string key = tcputils::receive_string(socket); auto iter = _store.find(key); PADDLE_ENFORCE_NE( iter, _store.end(), platform::errors::InvalidArgument("Key %s not found in TCPStore.", key)); std::vector value = iter->second; - VLOG(3) << "TCPStore: value (" - << std::stoll(std::string(reinterpret_cast(value.data()), - value.size())) - << ") for key (" << key << ")."; tcputils::send_vector(socket, value); } void MasterDaemon::_do_stop(SocketType socket) { + VLOG(3) << "MasterDaemon::_do_stop"; ReplyType value = 
ReplyType::STOP_WAIT; - _stop = true; tcputils::send_value(socket, value); + if (--_nranks == 0) { + _stop = true; + } } void MasterDaemon::_do_wait(SocketType socket) { + VLOG(3) << "MasterDaemon::_do_wait"; std::string key = tcputils::receive_string(socket); auto iter = _store.find(key); auto reply = ReplyType::STOP_WAIT; @@ -126,35 +136,47 @@ void MasterDaemon::run() { } for (size_t i = 1; i < fds.size(); i++) { - if (fds[i].revents == 0) { - continue; - } - - Command command = tcputils::receive_value(fds[i].fd); - VLOG(3) << "TCPStore: recv command: " << static_cast(command) << "."; - - switch (command) { - case Command::ADD: - _do_add(fds[i].fd); - break; - case Command::GET: - _do_get(fds[i].fd); - break; - case Command::WAIT: - _do_wait(fds[i].fd); - break; - case Command::STOP: - _do_stop(fds[i].fd); - break; + try { + if (fds[i].revents == 0) { + continue; + } + + Command command = tcputils::receive_value(fds[i].fd); + VLOG(3) << "TCPStore: recv command: " << static_cast(command) + << "."; + + switch (command) { + case Command::ADD: + _do_add(fds[i].fd); + break; + case Command::GET: + _do_get(fds[i].fd); + break; + case Command::SET: + _do_set(fds[i].fd); + break; + case Command::WAIT: + _do_wait(fds[i].fd); + break; + case Command::STOP: + _do_stop(fds[i].fd); + break; + default: + VLOG(0) << "Unknow command: " << static_cast(command); + exit(-1); + } + } catch (...) { + fds.erase(fds.begin() + i); + _sockets.erase(_sockets.begin() + i - 1); } } } } -std::unique_ptr TCPServer::create(uint16_t port) { +std::unique_ptr TCPServer::create(uint16_t port, int nranks) { int socket = tcputils::tcp_listen("", std::to_string(port), AF_INET); auto server = std::make_unique(); - server->_master_daemon = MasterDaemon::start(socket); + server->_master_daemon = MasterDaemon::start(socket, nranks); return server; } @@ -200,7 +222,7 @@ TCPStore::TCPStore(std::string host, uint16_t port, bool is_master, size_t num_workers, std::chrono::seconds timeout) : Store(timeout), _is_master(is_master), _num_workers(num_workers) { if (_is_master) { - _server = detail::TCPServer::create(port); + _server = detail::TCPServer::create(port, num_workers); } _client = detail::TCPClient::connect(host, port); @@ -213,36 +235,41 @@ void TCPStore::waitWorkers() { } add(_init_key, 1); - if (_server) { - auto begin = std::chrono::steady_clock::now(); - do { - auto value = get(_init_key); - int completed = std::stoi(std::string(value.begin(), value.end())); - VLOG(3) << completed << " worker ready, total " << _num_workers; - if (completed >= _num_workers) { - break; - } - const auto elapsed = std::chrono::duration_cast( - std::chrono::steady_clock::now() - begin); - - std::this_thread::sleep_for(std::chrono::milliseconds(100)); - if (_timeout != tcputils::kNoTimeout && elapsed > _timeout) { - PADDLE_ENFORCE_EQ( - completed, _num_workers, - platform::errors::InvalidArgument( - "TCPStore timeouted and not all workers got ready.")); - } - } while (true); - } + auto begin = std::chrono::steady_clock::now(); + do { + auto value = get(_init_key); + int completed = std::stoi(std::string(value.begin(), value.end())); + VLOG(3) << completed << " worker ready, total " << _num_workers; + if (completed >= _num_workers) { + break; + } + const auto elapsed = std::chrono::duration_cast( + std::chrono::steady_clock::now() - begin); + + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + if (_timeout != tcputils::kNoTimeout && elapsed > _timeout) { + PADDLE_ENFORCE_EQ( + completed, _num_workers, + 
platform::errors::InvalidArgument( + "TCPStore timeouted and not all workers got ready.")); + } + } while (true); VLOG(3) << "TCPStore initialized."; } int64_t TCPStore::add(const std::string& key, int64_t value) { + VLOG(3) << "TCPStore add."; _client->send_command_for_key(Command::ADD, _key_prefix + key); _client->send_value(value); return _client->receive_value(); } +void TCPStore::set(const std::string& key, const std::vector& value) { + VLOG(3) << "TCPStore set."; + _client->send_command_for_key(Command::SET, _key_prefix + key); + _client->send_vector(value); +} + std::vector TCPStore::get(const std::string& key) { wait(key); _client->send_command_for_key(Command::GET, _key_prefix + key); @@ -252,6 +279,7 @@ std::vector TCPStore::get(const std::string& key) { void TCPStore::wait(const std::string& key) { ReplyType reply; + VLOG(3) << "TCPStore wait."; do { _client->send_command_for_key(Command::WAIT, _key_prefix + key); @@ -261,6 +289,7 @@ void TCPStore::wait(const std::string& key) { } TCPStore::~TCPStore() { + VLOG(3) << "~TCPStore"; _client->send_command_for_key(Command::STOP, ""); ReplyType ret = _client->receive_value(); PADDLE_ENFORCE_EQ(ret, ReplyType::STOP_WAIT, diff --git a/paddle/fluid/distributed/store/tcp_store.h b/paddle/fluid/distributed/store/tcp_store.h index cd706dd6640acf5e0b5b3714175dac7a6cecb25a..17c1d8ea30a421f04d054d59ac93c8c60406ef68 100644 --- a/paddle/fluid/distributed/store/tcp_store.h +++ b/paddle/fluid/distributed/store/tcp_store.h @@ -27,15 +27,16 @@ namespace paddle { namespace distributed { enum class ReplyType { WAITING, STOP_WAIT }; -enum class Command { ADD, GET, WAIT, STOP }; +enum class Command { ADD, GET, SET, WAIT, STOP }; namespace detail { class MasterDaemon { public: - static std::unique_ptr start(SocketType listen_socket); + static std::unique_ptr start(SocketType listen_socket, + int nranks); MasterDaemon() = delete; - explicit MasterDaemon(SocketType listen_socket); + explicit MasterDaemon(SocketType listen_socket, int nranks); ~MasterDaemon(); private: @@ -43,18 +44,20 @@ class MasterDaemon { void _do_add(SocketType socket); void _do_wait(SocketType socket); void _do_get(SocketType socket); + void _do_set(SocketType socket); void _do_stop(SocketType socket); SocketType _listen_socket; std::vector _sockets; std::unordered_map> _store; std::thread _background_thread{}; + int _nranks; bool _stop = false; }; class TCPServer { public: TCPServer() = default; - static std::unique_ptr create(std::uint16_t port); + static std::unique_ptr create(std::uint16_t port, int nranks); private: std::unique_ptr _master_daemon; @@ -97,6 +100,7 @@ class TCPStore : public Store { int64_t add(const std::string& key, int64_t value) override; std::vector get(const std::string& key) override; void wait(const std::string& key) override; + void set(const std::string& key, const std::vector& value) override; private: void waitWorkers(); diff --git a/paddle/fluid/distributed/store/tcp_utils.cc b/paddle/fluid/distributed/store/tcp_utils.cc index d0561d0b9a9c5b01c32620e72d21ed562e42637e..a28cba288333d7f1c2a705049c29b59f43a70cc5 100644 --- a/paddle/fluid/distributed/store/tcp_utils.cc +++ b/paddle/fluid/distributed/store/tcp_utils.cc @@ -46,9 +46,10 @@ void close_socket(SocketType socket) { hints.ai_socktype = SOCK_STREAM; const char* node = host.empty() ? nullptr : host.c_str(); + const char* port_cstr = port.empty() ? 
nullptr : port.c_str(); int n; - n = ::getaddrinfo(node, port.c_str(), &hints, &res); + n = ::getaddrinfo(node, port_cstr, &hints, &res); const char* gai_err = ::gai_strerror(n); const char* proto = (family == AF_INET ? "IPv4" : family == AF_INET6 ? "IPv6" : ""); diff --git a/paddle/fluid/eager/CMakeLists.txt b/paddle/fluid/eager/CMakeLists.txt index 5e16ab2b391d0223a8b6fd9bae78cced9d4e2f11..f9d1b705390cb1c22bf9336292af30363c0010cf 100644 --- a/paddle/fluid/eager/CMakeLists.txt +++ b/paddle/fluid/eager/CMakeLists.txt @@ -1,8 +1,8 @@ -set(eager_deps phi phi_api hook_utils tensor_utils utils global_utils backward phi_tensor tracer layer autograd_meta grad_node_info grad_tensor_holder accumulation_node) +set(eager_deps phi_api hook_utils tensor_utils utils global_utils backward phi_tensor tracer layer autograd_meta grad_node_info grad_tensor_holder accumulation_node) set(fluid_deps tracer layer proto_desc operator op_registry variable_helper memcpy) -set(generated_deps dygraph_function dygraph_node) +set(generated_deps final_dygraph_function final_dygraph_node dygraph_function dygraph_node) -if(NOT ON_INFER) +if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) message("Performing Eager Dygraph Auto Code Generation") add_subdirectory(auto_code_generator) endif() @@ -10,11 +10,11 @@ endif() add_subdirectory(api) add_subdirectory(accumulation) -cc_library(grad_node_info SRCS grad_node_info.cc DEPS phi phi_api) +cc_library(grad_node_info SRCS grad_node_info.cc DEPS phi_api phi_tensor) cc_library(grad_tensor_holder SRCS grad_tensor_holder.cc DEPS grad_node_info gradient_accumulator) -cc_library(autograd_meta SRCS autograd_meta.cc DEPS phi phi_api) -cc_library(utils SRCS utils.cc DEPS phi phi_api global_utils layer proto_desc operator op_registry variable_helper memcpy scale_op autograd_meta hook_utils) +cc_library(autograd_meta SRCS autograd_meta.cc DEPS phi_api phi_tensor) +cc_library(utils SRCS utils.cc DEPS phi_api phi_tensor global_utils layer proto_desc operator op_registry variable_helper memcpy scale_op autograd_meta hook_utils) cc_library(backward SRCS backward.cc DEPS grad_tensor_holder utils autograd_meta grad_node_info) add_subdirectory(tests) diff --git a/paddle/fluid/eager/accumulation/accumulation_node.h b/paddle/fluid/eager/accumulation/accumulation_node.h index 734cabdc3dc914349e2ad30b657bfb6542a7472a..07fa40165167ce2352018c0e1b1cb08222d5a181 100644 --- a/paddle/fluid/eager/accumulation/accumulation_node.h +++ b/paddle/fluid/eager/accumulation/accumulation_node.h @@ -24,11 +24,14 @@ class GradNodeAccumulation : public GradNodeBase { public: // Constructor: configure fwd input tensors to grad node explicit GradNodeAccumulation(AutogradMeta* meta) : GradNodeBase(1, 1) { + VLOG(6) << "Construct GradNodeAccumulation"; weak_grad_ = meta->WeakGrad(); SetDefaultGradInOutMeta(); } - ~GradNodeAccumulation() override = default; + ~GradNodeAccumulation() override { + VLOG(6) << "Destruct GradNodeAccumulation"; + } // Functor: perform backward computations virtual std::vector> operator()( diff --git a/paddle/fluid/eager/api/generated/CMakeLists.txt b/paddle/fluid/eager/api/generated/CMakeLists.txt index ebbef286f7923003295224a38c56c50eb3fa9c5a..4f634c6884b45a83f09348d5cc4749e6272b2a51 100644 --- a/paddle/fluid/eager/api/generated/CMakeLists.txt +++ b/paddle/fluid/eager/api/generated/CMakeLists.txt @@ -1,5 +1,5 @@ add_subdirectory(eager_generated) -if(NOT ON_INFER) +if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) add_subdirectory(fluid_generated) endif() diff --git 
a/paddle/fluid/eager/api/generated/eager_generated/backwards/CMakeLists.txt b/paddle/fluid/eager/api/generated/eager_generated/backwards/CMakeLists.txt index 77d8ec57efcaa6c4e83a69f4b2a97b128b174389..81ff07b8963f97b8c257e0204c4cdcc0fc82ea63 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/backwards/CMakeLists.txt +++ b/paddle/fluid/eager/api/generated/eager_generated/backwards/CMakeLists.txt @@ -1,6 +1,6 @@ cc_library(scale_node SRCS scale_node.cc DEPS global_utils phi phi_api grad_node_info) -if(NOT ON_INFER) +if(NOT (NOT WITH_PYTHON AND ON_INFER)) cc_library(final_dygraph_node SRCS nodes.cc DEPS ${eager_deps}) add_dependencies(final_dygraph_node eager_final_state_codegen) endif() diff --git a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h index c0150a1730d52b3410ba4ea0d31674fbfed596ae..247fde6ed1f869542969b068cdae9f59cedd732a 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h +++ b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h @@ -46,7 +46,7 @@ class GradNodeScale : public GradNodeBase { const std::vector& tensors); void SetAttributes_scale(float scale); - + std::string name() override { return ""; } // Members: define fwd input tensors // For Scale there is no fwd input tensor needed private: diff --git a/paddle/fluid/eager/api/generated/eager_generated/forwards/CMakeLists.txt b/paddle/fluid/eager/api/generated/eager_generated/forwards/CMakeLists.txt index 60b35340eabd1fa03f59cc0b7ea278351be96df1..c70bb80c35c78ca476c8612d804bdd1e9b3838ff 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/forwards/CMakeLists.txt +++ b/paddle/fluid/eager/api/generated/eager_generated/forwards/CMakeLists.txt @@ -1,6 +1,6 @@ cc_library(eager_scale SRCS scale.cc DEPS phi_api phi autograd_meta scale_node) -if(NOT ON_INFER) +if(NOT (NOT WITH_PYTHON AND ON_INFER)) cc_library(final_dygraph_function SRCS dygraph_functions.cc DEPS ${eager_deps}) add_dependencies(final_dygraph_function eager_final_state_codegen) endif() diff --git a/paddle/fluid/eager/api/utils/hook_utils.cc b/paddle/fluid/eager/api/utils/hook_utils.cc index c7927716300528fdfa571de720ce12e7246b5f1d..9abd7be49d44cbab4b3482961df461dd7164328f 100644 --- a/paddle/fluid/eager/api/utils/hook_utils.cc +++ b/paddle/fluid/eager/api/utils/hook_utils.cc @@ -52,49 +52,44 @@ void RegisterReduceHookForTensor(const paddle::experimental::Tensor& tensor, } } -static void RetainGradForRegularNode( - const paddle::experimental::Tensor& tensor) { - AutogradMeta* meta = EagerUtils::unsafe_autograd_meta(tensor); - if (meta->RetainGrads()) { +void RetainGradForTensor(const paddle::experimental::Tensor& tensor) { + if (IsLeafTensor(tensor)) { + // Leaf tensor's grad will always be retained + // Refer to implementation of AccumulationNode for more details return; } else { - meta->SetRetainGrads(true); - } + AutogradMeta* meta = EagerUtils::unsafe_autograd_meta(tensor); + if (meta->RetainGrads()) { + return; + } else { + meta->SetRetainGrads(true); + } - std::weak_ptr weak_grad_tensor = - meta->WeakGrad(); + std::weak_ptr weak_grad_tensor = + meta->WeakGrad(); - // Define Hook - auto hook = [weak_grad_tensor](const paddle::experimental::Tensor& t) { - if (!weak_grad_tensor.expired()) { - auto grad_tensor = weak_grad_tensor.lock(); - if (t.defined()) { - VLOG(7) << "Set impl for RetainGrad Hook for tensor: " << t.name(); - // Simply Copy impl() to grad_tensor - grad_tensor->set_impl(t.impl()); - 
return *grad_tensor.get(); + // Define Hook + auto hook = [weak_grad_tensor](const paddle::experimental::Tensor& t) { + if (!weak_grad_tensor.expired()) { + auto grad_tensor = weak_grad_tensor.lock(); + if (t.defined()) { + VLOG(7) << "Set impl for RetainGrad Hook for tensor: " << t.name(); + // Simply Copy impl() to grad_tensor + grad_tensor->set_impl(t.impl()); + return *grad_tensor.get(); + } else { + VLOG(7) << "Retain NULL paddle::experimental::Tensor in Grad Hook"; + return paddle::experimental::Tensor(); + } } else { VLOG(7) << "Retain NULL paddle::experimental::Tensor in Grad Hook"; return paddle::experimental::Tensor(); } - } else { - VLOG(7) << "Retain NULL paddle::experimental::Tensor in Grad Hook"; - return paddle::experimental::Tensor(); - } - }; + }; - // Append to GradientHooks - RegisterGradientHookForTensor(tensor, - std::make_shared(hook)); -} - -void RetainGradForTensor(const paddle::experimental::Tensor& tensor) { - if (IsLeafTensor(tensor)) { - // Leaf tensor's grad will always be retained - // Refer to implementation of AccumulationNode for more details - return; - } else { - RetainGradForRegularNode(tensor); + // Append to GradientHooks + RegisterGradientHookForTensor(tensor, + std::make_shared(hook)); } } diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc index a8e0ed7a41a043e12332ad347f673a6c27e5f1ec..dc79a8a45a246798551a0bcce8c487f67183220b 100644 --- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -47,6 +47,9 @@ std::unordered_map> static std::unordered_map operators_with_attrs = {}; +/* --- Black Ops list that's NO NEED to apply code generation --- */ +static std::unordered_set black_ops_list = {"run_program"}; + static std::string LegalizeVariableName(const std::string& var_name) { std::string ret = var_name; std::replace(ret.begin(), ret.end(), '-', '_'); // replace all '-' to '_' @@ -73,12 +76,6 @@ static bool IgnoreGradAttribute(const std::string& op_type, } static void PrepareAttrMapForOps() { - // Handle "run_program_op" - static framework::ProgramDesc fake_prog; - operators_with_attrs["run_program"] = {}; - operators_with_attrs["run_program"]["global_block"] = - fake_prog.MutableBlock(0); - // Handle "fused_elemwise_add_activation" std::vector functor_list = {"a", "b"}; operators_with_attrs["fused_elemwise_add_activation"] = {}; @@ -996,6 +993,29 @@ static std::string GenerateGradNodeCreationContent( // then generate: "egr::AutogradMeta* p_autograd_out = // egr::EagerUtils::autograd_meta("op_proto->outputs()[0].name()")" std::string get_autograd_meta_str = " // Prepare Autograd Meta \n"; + // If single output slotname and not duplicable, + // then generate: "egr::AutogradMeta* p_autograd_out = + // egr::EagerUtils::autograd_meta("op_proto.outputs()[0].name()")" + for (const proto::OpProto::Var& output : out_vars) { + const std::string& output_name = output.name(); + const std::string& output_autograd_name = "p_autograd_" + output_name; + + if (output.duplicable()) { + const char* GET_MULTI_AUTOGRAD_META_TEMPLATE = + " std::vector %s = " + "egr::EagerUtils::autograd_meta(&%s);\n"; + get_autograd_meta_str += paddle::string::Sprintf( + GET_MULTI_AUTOGRAD_META_TEMPLATE, output_autograd_name, output_name); + } else { + const char* GET_SINGLE_AUTOGRAD_META_TEMPLATE = + " egr::AutogradMeta* %s = " + "egr::EagerUtils::autograd_meta(&%s);\n"; + get_autograd_meta_str += paddle::string::Sprintf( + 
GET_SINGLE_AUTOGRAD_META_TEMPLATE, output_autograd_name, output_name); + } + } + VLOG(6) << "Generated outputs autograd_meta"; + for (const proto::OpProto::Var& input : in_vars) { const std::string& input_name = input.name(); const std::string& input_autograd_name = "p_autograd_" + input_name; @@ -1024,31 +1044,6 @@ static std::string GenerateGradNodeCreationContent( } VLOG(6) << "Generated inputs autograd_meta"; - // If single output slotname and not duplicable, - // then generate: "egr::AutogradMeta* p_autograd_out = - // egr::EagerUtils::autograd_meta("op_proto.outputs()[0].name()")" - for (const proto::OpProto::Var& output : out_vars) { - const std::string& output_name = output.name(); - const std::string& output_autograd_name = "p_autograd_" + output_name; - - // Skip Intermediate Tensor - - if (output.duplicable()) { - const char* GET_MULTI_AUTOGRAD_META_TEMPLATE = - " std::vector %s = " - "egr::EagerUtils::autograd_meta(&%s);\n"; - get_autograd_meta_str += paddle::string::Sprintf( - GET_MULTI_AUTOGRAD_META_TEMPLATE, output_autograd_name, output_name); - } else { - const char* GET_SINGLE_AUTOGRAD_META_TEMPLATE = - " egr::AutogradMeta* %s = " - "egr::EagerUtils::autograd_meta(&%s);\n"; - get_autograd_meta_str += paddle::string::Sprintf( - GET_SINGLE_AUTOGRAD_META_TEMPLATE, output_autograd_name, output_name); - } - } - VLOG(6) << "Generated outputs autograd_meta"; - std::string prepare_autograd_meta_str = ""; prepare_autograd_meta_str += get_autograd_meta_str; prepare_autograd_meta_str += "\n"; @@ -1156,11 +1151,13 @@ static std::string GenerateGradNodeCreationContent( grad_node_creation_str += paddle::string::Sprintf( SET_OUT_RANK_TEMPLATE, output_autograd_name, output_position); - const char* SET_HISTORY_TEMPLATE = - " egr::EagerUtils::SetHistory(&%s, grad_node);\n"; - grad_node_creation_str += - paddle::string::Sprintf(SET_HISTORY_TEMPLATE, output_autograd_name); - + // Intermediate Tensor does not require SetHistory + if (!output.intermediate()) { + const char* SET_HISTORY_TEMPLATE = + " egr::EagerUtils::SetHistory(&%s, grad_node);\n"; + grad_node_creation_str += + paddle::string::Sprintf(SET_HISTORY_TEMPLATE, output_autograd_name); + } const char* SET_GRAD_IN_META_TEMPLATE = " grad_node->SetGradInMeta(&%s, %d);\n"; grad_node_creation_str += paddle::string::Sprintf( @@ -1173,17 +1170,20 @@ static std::string GenerateGradNodeCreationContent( grad_node_creation_str += paddle::string::Sprintf( SET_OUT_RANK_TEMPLATE, output_autograd_name, output_position); - const char* SET_HISTORY_TEMPLATE = - " egr::EagerUtils::SetHistory(%s, grad_node);\n"; - grad_node_creation_str += - paddle::string::Sprintf(SET_HISTORY_TEMPLATE, output_autograd_name); - + // Intermediate Tensor does not require SetHistory + if (!output.intermediate()) { + const char* SET_HISTORY_TEMPLATE = + " egr::EagerUtils::SetHistory(%s, grad_node);\n"; + grad_node_creation_str += + paddle::string::Sprintf(SET_HISTORY_TEMPLATE, output_autograd_name); + } const char* SET_GRAD_IN_META_TEMPLATE = " grad_node->SetGradInMeta(%s, %d);\n"; grad_node_creation_str += paddle::string::Sprintf( SET_GRAD_IN_META_TEMPLATE, output_autograd_name, output_position); } + // Intermediate Tensor does not require CheckAndRetainGrad if (!output.intermediate()) { VLOG(6) << "Generated Call RetainGradForTensor"; const char* RETAIN_GRAD_TEMPLATE = @@ -1199,11 +1199,12 @@ static std::string GenerateGradNodeCreationContent( " %s" " bool require_any_grad = egr::EagerUtils::ComputeRequireGrad(%s);\n" " if(require_any_grad) {\n" + " VLOG(6) << \" Construct 
Grad for %s \"; \n" " egr::EagerUtils::PassStopGradient(%s);\n" "%s\n }"; std::string grad_node_creation_body_str = paddle::string::Sprintf( GRAD_NODE_CREATION_TEMPLATE, prepare_autograd_meta_str, - compute_require_grad_args, pass_stop_gradient_args, + compute_require_grad_args, op_type, pass_stop_gradient_args, grad_node_creation_str); return grad_node_creation_body_str; @@ -2078,22 +2079,24 @@ static std::string GenerateGradNodeHeaderContents( const char* GRAD_NODE_TEMPLATE = "class GradNode%s : public egr::GradNodeBase {\n" " public:\n" - " GradNode%s() : egr::GradNodeBase() {}\n" + " GradNode%s() : egr::GradNodeBase() { VLOG(7) << \" Construct " + "GradNode%s \"; }\n" " GradNode%s(size_t bwd_in_slot_num, size_t bwd_out_slot_num) : " - "egr::GradNodeBase(bwd_in_slot_num, bwd_out_slot_num) {}\n" - " ~GradNode%s() override = default;\n" + "egr::GradNodeBase(bwd_in_slot_num, bwd_out_slot_num) { VLOG(7) << \" " + "Construct GradNode%s \"; }\n" + " ~GradNode%s() override { VLOG(6) << \" Destruct GradNode%s \"; }\n" "\n" " virtual std::vector> " "operator()(const " "std::vector>& grads) " "override;\n" "\n" + " std::string name() override { return \" GradNode%s \"; } \n " + "\n" " // SetX, SetY, ...\n" "%s\n" " // SetAttrMap\n" "%s\n" - " std::string name() { return \"GradNode%s\"; }\n" - "\n" " private:\n" " // TensorWrappers\n" "%s\n" @@ -2190,8 +2193,8 @@ static std::string GenerateGradNodeHeaderContents( VLOG(6) << "Generated TensorWrapper"; std::string grad_node_str = paddle::string::Sprintf( - GRAD_NODE_TEMPLATE, op_type, op_type, op_type, op_type, - set_tensor_wrappers_str, set_attr_map_str, op_type, + GRAD_NODE_TEMPLATE, op_type, op_type, op_type, op_type, op_type, op_type, + op_type, op_type, set_tensor_wrappers_str, set_attr_map_str, tensor_wrapper_members_str, attr_members_str); return grad_node_str; @@ -2343,6 +2346,9 @@ static void DygraphCodeGeneration(const std::string& output_dir) { if (!CheckOpProto(op_proto)) continue; const std::string& op_type = op_proto->type(); + if (black_ops_list.count(op_type)) { + continue; + } /* ----------------------------- */ /* ---- Collect Information ---- */ diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt b/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt index c6bca01205e19c58d5924f4e9d60bb76164fee2b..53af6c1048d2454b1e9f375b837103930026ae54 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt @@ -1,5 +1,5 @@ -set(api_yaml_path "${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/api.yaml") -set(backward_yaml_path "${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/backward.yaml") +set(api_yaml_path "${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/api.yaml,${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_api.yaml") +set(backward_yaml_path "${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/backward.yaml,${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_bw_api.yaml") set(tmp_forwards_cc_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/forwards/tmp_dygraph_functions.cc") set(tmp_forwards_h_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/forwards/tmp_dygraph_functions.h") set(tmp_nodes_cc_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/backwards/tmp_nodes.cc") diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py 
b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py index c6e56e34627a52bc19df7e8d87371811fcec8697..967891fe5227dcd6129c0ef1808fba7720711568 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py @@ -23,6 +23,20 @@ core_ops_returns_info = {} core_ops_args_info = {} core_ops_args_type_info = {} +namespace = "" + +yaml_types_mapping = { + 'int' : 'int', 'int32' : 'int32_t', 'int64' : 'int64_t', 'size_t' : 'size_t', \ + 'float' : 'float', 'double' : 'double', 'bool' : 'bool', \ + 'Backend' : 'paddle::experimental::Backend', 'DataLayout' : 'paddle::experimental::DataLayout', 'DataType' : 'paddle::experimental::DataType', \ + 'int64[]' : 'std::vector', 'int[]' : 'std::vector', + 'Tensor' : 'Tensor', + 'Tensor[]' : 'std::vector', + 'Tensor[Tensor[]]' : 'std::vector>', + 'Scalar' : 'paddle::experimental::Scalar', + 'ScalarArray' : 'paddle::experimental::ScalarArray' +} + def ParseArguments(): parser = argparse.ArgumentParser( @@ -59,7 +73,9 @@ def IsPlainTensorType(string): def IsVectorTensorType(string): - vector_tensor_types = ['list(Tensor)'] + vector_tensor_types = [ + 'std::vector>', 'std::vector' + ] if string in vector_tensor_types: return True return False @@ -110,6 +126,7 @@ def GetAutoGradMetaVectorName(string): def ReadFwdFile(filepath): f = open(filepath, 'r') contents = yaml.load(f, Loader=yaml.FullLoader) + f.close() return contents @@ -118,9 +135,13 @@ def ReadBwdFile(filepath): contents = yaml.load(f, Loader=yaml.FullLoader) ret = {} for content in contents: - assert 'backward_api' in content.keys() - api_name = content['backward_api'] + if 'backward_api' in content.keys(): + api_name = content['backward_api'] + else: + assert False + ret[api_name] = content + f.close() return ret @@ -180,6 +201,9 @@ def ParseYamlArgs(string): arg_name = m.group(3).split("=")[0].strip() default_value = m.group(3).split("=")[1].strip() if len( m.group(3).split("=")) > 1 else None + + assert arg_type in yaml_types_mapping.keys() + arg_type = yaml_types_mapping[arg_type] if "Tensor" in arg_type: assert default_value is None inputs_list.append([arg_name, arg_type, i]) @@ -190,35 +214,30 @@ def ParseYamlArgs(string): def ParseYamlReturns(string): - # Example: Tensor, Tensor + # Example0: Tensor(out), Tensor(out1) + # Example1: Tensor, Tensor + # Example2: Tensor[](out), Tensor - # list = [ ["", ret_type, orig_position], ...] + # list = [ [ret_name, ret_type, orig_position], ...] returns_list = [] returns = [x.strip() for x in string.strip().split(",")] + for i in range(len(returns)): ret = returns[i] - returns_list.append(["", ret, i]) - - return returns_list + ret_name = "" + if "(" in ret and ")" in ret: + # Remove trailing ')' + ret = ret[:-1] + ret_type = ret.split("(")[0].strip() + ret_name = ret.split("(")[1].strip() + else: + ret_type = ret.strip() -def ParseYamlReturnsWithName(string): - # Example: Tensor(out), Tensor(out1) - - # list = [ [ret_name, ret_type, orig_position], ...] 
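A note on the parsing rule above: the merged ParseYamlReturns now accepts both the bare `Tensor, Tensor` form and the named `Tensor(out), Tensor(out1)` form, and maps every YAML type through yaml_types_mapping. The snippet below is a standalone sketch of that rule only, with a two-entry stand-in for the mapping and a hypothetical parse_returns helper; it is not the generator's code.

```python
# Stand-in for yaml_types_mapping, trimmed to two entries for illustration.
ILLUSTRATIVE_TYPES = {
    'Tensor': 'Tensor',
    'Tensor[]': 'std::vector<Tensor>',
}

def parse_returns(string):
    # Produces [ [ret_name, ret_type, orig_position], ... ], the same shape
    # as the generator's returns_list.
    returns_list = []
    for i, ret in enumerate(x.strip() for x in string.split(",")):
        ret_name = ""
        if "(" in ret and ")" in ret:
            ret = ret[:-1]                      # drop the trailing ')'
            ret_type, ret_name = (s.strip() for s in ret.split("("))
        else:
            ret_type = ret
        assert ret_type in ILLUSTRATIVE_TYPES, f"unknown YAML type: {ret_type}"
        returns_list.append([ret_name, ILLUSTRATIVE_TYPES[ret_type], i])
    return returns_list

# "Tensor(out), Tensor[](grads)" ->
#   [['out', 'Tensor', 0], ['grads', 'std::vector<Tensor>', 1]]
print(parse_returns("Tensor(out), Tensor[](grads)"))
```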
- returns_list = [] - - returns = [x.strip() for x in string.strip().split(",")] + assert ret_type in yaml_types_mapping.keys() + ret_type = yaml_types_mapping[ret_type] - atype = r'(.*?)' - aname = r'(.*?)' - pattern = f'{atype}\({aname}\)' - for i in range(len(returns)): - ret = returns[i] - m = re.search(pattern, ret) - ret_type = m.group(1) - ret_name = m.group(2) assert "Tensor" in ret_type returns_list.append([ret_name, ret_type, i]) @@ -240,7 +259,7 @@ def ParseYamlForwardFromBackward(string): function_returns = m.group(3) forward_inputs_list, forward_attrs_list = ParseYamlArgs(function_args) - forward_returns_list = ParseYamlReturnsWithName(function_returns) + forward_returns_list = ParseYamlReturns(function_returns) return forward_inputs_list, forward_attrs_list, forward_returns_list @@ -270,7 +289,7 @@ def ParseYamlBackward(args_str, returns_str): args_str = re.search(args_pattern, args_str).group(1) inputs_list, attrs_list = ParseYamlArgs(args_str) - returns_list = ParseYamlReturnsWithName(returns_str) + returns_list = ParseYamlReturns(returns_str) return inputs_list, attrs_list, returns_list @@ -496,11 +515,18 @@ def GenerateNodeDeclaration(fwd_api_name, backward_fwd_input_map, set_attribute_methods_str += SET_ATTR_METHOD_TEMPLATE.format( aname, GetConstReference(atype), aname, saved_attr_name, aname) - ATTRIBUTE_MEMBER_TEMPLATE = """ - {} {} = {}; -""" - attribute_members_str += ATTRIBUTE_MEMBER_TEMPLATE.format( - RemoveConstAndReference(atype), saved_attr_name, default_val) + if default_val: + ATTRIBUTE_MEMBER_TEMPLATE = """ + {} {} = {}; + """ + attribute_members_str += ATTRIBUTE_MEMBER_TEMPLATE.format( + RemoveConstAndReference(atype), saved_attr_name, default_val) + else: + ATTRIBUTE_MEMBER_TEMPLATE = """ + {} {}; + """ + attribute_members_str += ATTRIBUTE_MEMBER_TEMPLATE.format( + RemoveConstAndReference(atype), saved_attr_name) # End: SetAttributes & Attribute Members grad_node_name = GetGradNodeName(fwd_api_name) @@ -514,7 +540,7 @@ class {} : public egr::GradNodeBase {{ virtual std::vector> operator()( const std::vector>& grads) override; - + std::string name() override {{ return \" {} \"; }} // SetTensorWrapperX, SetTensorWrapperY, ... 
{} // SetAttributes @@ -529,8 +555,9 @@ class {} : public egr::GradNodeBase {{ """ node_declaration_str = NODE_DECLARATION_TEMPLATE.format( grad_node_name, grad_node_name, grad_node_name, grad_node_name, - set_tensor_wrapper_methods_str, set_attribute_methods_str, - tensor_wrapper_members_str, attribute_members_str) + grad_node_name, set_tensor_wrapper_methods_str, + set_attribute_methods_str, tensor_wrapper_members_str, + attribute_members_str) return node_declaration_str @@ -587,16 +614,23 @@ def GenerateNodeDefinition(fwd_api_name, bwd_api_name, backward_fwd_input_map, returns_str += f"return returns;\n" grad_node_name = GetGradNodeName(fwd_api_name) + + if len(namespace) > 0: + grad_api_namespace = f"paddle::experimental::{namespace}" + else: + grad_api_namespace = f"paddle::experimental" + FUNCTION_TEMPLATE = """ std::vector> {}::operator()(const std::vector>& grads) {{ // Call grad_api function - auto grad_api_returns = paddle::experimental::{}({}); + auto grad_api_returns = {}::{}({}); {} }} """ node_definition_str = FUNCTION_TEMPLATE.format( - grad_node_name, bwd_api_name, grad_api_args_str, returns_str) + grad_node_name, grad_api_namespace, bwd_api_name, grad_api_args_str, + returns_str) return node_definition_str @@ -650,7 +684,7 @@ def GenerateNodeCreationCodes( else: # Tuple api_result if IsPlainTensorType(rtype): - outputs_autograd_meta = f" egr::AutogradMeta* {output_autograd_meta_name} = egr::EagerUtils::autograd_meta(&api_result[{pos}]);" + output_autograd_meta = f" egr::AutogradMeta* {output_autograd_meta_name} = egr::EagerUtils::autograd_meta(&api_result[{pos}]);" else: assert IsVectorTensorType(rtype) output_autograd_meta = f" std::vector {output_autograd_meta_vec_name} = egr::EagerUtils::autograd_meta(&api_result[{pos}]);\n" @@ -678,18 +712,24 @@ def GenerateNodeCreationCodes( # SetTensorWrappers set_tensor_wrappers_list = [] - for name, (_, is_fwd_input, _) in backward_fwd_input_map.items(): + for name, (atype, is_fwd_input, pos) in backward_fwd_input_map.items(): is_optional = (name in optional_inputs) + if is_fwd_input: if is_optional: set_tensor_wrappers = f" if({name}.is_initialized()) grad_node->SetTensorWrapper{name}({name}, true);" else: set_tensor_wrappers = f" grad_node->SetTensorWrapper{name}({name}, true);" else: + if IsVectorTensorType(atype): + tw_name = f"api_result[{pos}]" + else: + tw_name = f"api_result" + if is_optional: - set_tensor_wrappers = f" if({name}.is_initialized()) grad_node->SetTensorWrapper{name}({name}, false);" + set_tensor_wrappers = f" if({tw_name}.is_initialized()) grad_node->SetTensorWrapper{name}({tw_name}, false);" else: - set_tensor_wrappers = f" grad_node->SetTensorWrapper{name}({name}, false);" + set_tensor_wrappers = f" grad_node->SetTensorWrapper{name}({tw_name}, false);" set_tensor_wrappers_list.append(set_tensor_wrappers) set_tensor_wrappers_str = "\n".join(set_tensor_wrappers_list) @@ -829,7 +869,11 @@ def GenerateForwardDefinition(fwd_api_name, bwd_api_name, function_name = fwd_api_name else: function_name = fwd_api_name + "_intermediate" - forward_call_str = f"auto api_result = paddle::experimental::{function_name}({inputs_call_args_str});" + + if len(namespace) > 0: + forward_call_str = f"auto api_result = paddle::experimental::{namespace}::{function_name}({inputs_call_args_str});" + else: + forward_call_str = f"auto api_result = paddle::experimental::{function_name}({inputs_call_args_str});" # Get return type list & outputs num_outputs = len(forward_outputs_position_map.keys()) - len( @@ -979,7 +1023,9 @@ def 
GenerateNodeCCFile(filepath, node_definition_str): #include "paddle/fluid/eager/utils.h" #include "paddle/fluid/eager/api/utils/global_utils.h" #include "paddle/fluid/eager/api/generated/eager_generated/backwards/nodes.h" +#include "paddle/fluid/eager/to_static/run_program_op_node.h" +#include "paddle/phi/api/include/sparse_api.h" """ file_contents += node_definition_str with open(filepath, 'a') as f: @@ -1000,11 +1046,12 @@ def GenerateNodeHFile(filepath, node_declaration_str): def GenerateForwardCCFile(filepath, forward_definition_str): file_contents = """ +#include "paddle/phi/api/lib/dygraph_api.h" #include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h" #include "paddle/fluid/eager/api/generated/eager_generated/backwards/nodes.h" +#include "paddle/phi/api/include/sparse_api.h" #include "paddle/fluid/eager/api/utils/global_utils.h" - """ file_contents += GenerateCoreOpInfoDefinition() @@ -1021,6 +1068,7 @@ def GenerateForwardHFile(filepath, forward_function_declaration_str): #include "paddle/phi/api/all.h" #include "paddle/fluid/eager/utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/eager/to_static/run_program_op_func.h" """ file_contents += GenerateCoreOpInfoDeclaration() @@ -1032,134 +1080,184 @@ def GenerateForwardHFile(filepath, forward_function_declaration_str): if __name__ == "__main__": args = ParseArguments() - api_yaml_path = args.api_yaml_path - backward_yaml_path = args.backward_yaml_path - - fwd_api_list = ReadFwdFile(api_yaml_path) - grad_api_dict = ReadBwdFile(backward_yaml_path) + api_yaml_paths = args.api_yaml_path.split(",") + backward_yaml_paths = args.backward_yaml_path.split(",") # Generate per Dygraph API node_declaration_str = "" node_definition_str = "" forward_definition_str = "" forward_declaration_str = "" - for fwd_api in fwd_api_list: - # We only generate Ops with grad - if 'backward' not in fwd_api.keys(): - continue - assert 'api' in fwd_api.keys() - assert 'args' in fwd_api.keys() - assert 'output' in fwd_api.keys() - assert 'backward' in fwd_api.keys() - - no_need_buffer_set = set() - if 'no_need_buffer' in fwd_api.keys(): - no_need_buffer_set = ParseNoNeedBuffer(fwd_api['no_need_buffer']) - - fwd_api_name = fwd_api['api'] - fwd_args_str = fwd_api['args'] - fwd_returns_str = fwd_api['output'] - - bwd_api_name = fwd_api['backward'] - assert bwd_api_name in grad_api_dict.keys() - bwd_api = grad_api_dict[bwd_api_name] - - assert 'args' in bwd_api.keys() - assert 'output' in bwd_api.keys() - assert 'forward' in bwd_api.keys() - - # Parse Dispensable Inputs - optional_inputs = [] - if 'optional' in fwd_api.keys(): - optional_inputs = ParseDispensable(fwd_api['optional']) - - bwd_forward_str = bwd_api['forward'] - bwd_args_str = bwd_api['args'] - bwd_returns_str = bwd_api['output'] - - # Collect Forward Inputs/Outputs - forward_inputs_list, forward_attrs_list, forward_returns_list = ParseYamlForwardFromBackward( - bwd_forward_str) - print("Parsed Forward Inputs List: ", forward_inputs_list) - print("Prased Forward Attrs List: ", forward_attrs_list) - print("Parsed Forward Returns List: ", forward_returns_list) - - intermediate_outputs = [] - if 'intermediate' in fwd_api.keys(): - intermediate_outputs = ParseIntermediate(fwd_api['intermediate']) - - IntermediateValidationCheck(intermediate_outputs, forward_returns_list) - - # Collect Original Forward Inputs/Outputs and then perform validation checks - orig_forward_inputs_list, orig_forward_attrs_list, orig_forward_returns_list = ParseYamlForward( - 
fwd_args_str, fwd_returns_str) - print("Parsed Original Forward Inputs List: ", orig_forward_inputs_list) - print("Prased Original Forward Attrs List: ", orig_forward_attrs_list) - print("Parsed Original Forward Returns List: ", - orig_forward_returns_list) - - # Forward Validation Checks - ForwardsValidationCheck(forward_inputs_list, forward_attrs_list, - forward_returns_list, orig_forward_inputs_list, - orig_forward_attrs_list, - orig_forward_returns_list) - - # Parse Backward Inputs/Outputs - backward_inputs_list, backward_attrs_list, backward_returns_list = ParseYamlBackward( - bwd_args_str, bwd_returns_str) - print("Parsed Backward Inputs List: ", backward_inputs_list) - print("Prased Backward Attrs List: ", backward_attrs_list) - print("Parsed Backward Returns List: ", backward_returns_list) - - # Determine Forward Inputs/Outputs Position - forward_inputs_position_map, forward_outputs_position_map = DetermineForwardPositionMap( - forward_inputs_list, forward_returns_list) - print("Generated Forward Input Position Map: ", - forward_inputs_position_map) - print("Generated Forward Output Position Map: ", - forward_outputs_position_map) - - # SlotName Matching - backward_fwd_input_map, backward_grad_input_map, backward_grad_output_map = SlotNameMatching( - backward_inputs_list, backward_returns_list, - forward_inputs_position_map, forward_outputs_position_map) - print("Generated Backward Fwd Input Map: ", backward_fwd_input_map) - print("Generated Backward Grad Input Map: ", backward_grad_input_map) - print("Generated Backward Grad Output Map: ", backward_grad_output_map) - - # Backward Validation Check - BackwardValidationCheck(backward_fwd_input_map, backward_grad_input_map, - backward_attrs_list) - - # Node Declaration Generation - node_declaration_str += GenerateNodeDeclaration( - fwd_api_name, backward_fwd_input_map, backward_attrs_list, - no_need_buffer_set) - print("Generated Node Declaration: ", node_declaration_str) - - node_definition_str += GenerateNodeDefinition( - fwd_api_name, bwd_api_name, backward_fwd_input_map, - backward_grad_input_map, backward_grad_output_map, - backward_attrs_list) - print("Generated Node Definition: ", node_definition_str) - - # Node Definition Generation - definition_declaration_pair = GenerateForwardDefinition( - fwd_api_name, bwd_api_name, forward_inputs_position_map, - forward_outputs_position_map, forward_attrs_list, - backward_fwd_input_map, backward_grad_input_map, - backward_grad_output_map, backward_attrs_list, optional_inputs, - intermediate_outputs) - print("Generated Forward Definition: ", forward_definition_str) - print("Generated Forward Declaration: ", forward_declaration_str) - forward_definition_str += definition_declaration_pair[0] - forward_declaration_str += definition_declaration_pair[1] - - # For python-level API dispatch - CollectCoreOpsInformation(fwd_api_name, forward_inputs_position_map, - forward_outputs_position_map, - forward_attrs_list) + for i in range(len(api_yaml_paths)): + api_yaml_path = api_yaml_paths[i] + backward_yaml_path = backward_yaml_paths[i] + + if "sparse" in api_yaml_path: + assert "sparse" in backward_yaml_path + namespace = "sparse" + else: + namespace = "" + + fwd_api_list = ReadFwdFile(api_yaml_path) + grad_api_dict = ReadBwdFile(backward_yaml_path) + + yaml_forward_definition_str = "" + yaml_forward_declaration_str = "" + yaml_node_declaration_str = "" + yaml_node_definition_str = "" + for fwd_api in fwd_api_list: + # We only generate Ops with grad + if 'backward' not in fwd_api.keys(): + continue + 
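The loop above is the heart of the multi-file change: the generator now takes comma-separated YAML paths, any file whose path contains "sparse" gets `namespace = "sparse"`, and the strings generated from that file are later wrapped in a matching C++ namespace block. A condensed sketch of that flow, assuming a hypothetical gen_for_file placeholder for the real per-API generation:

```python
def gen_for_file(path):
    # Hypothetical placeholder for the real per-file code generation.
    return f"// definitions generated from {path}\n"

def generate(api_yaml_path_arg):
    forward_definition_str = ""
    for api_yaml_path in api_yaml_path_arg.split(","):
        # "sparse" in the path selects the extra C++ namespace, mirroring
        # the `namespace = "sparse"` branch above.
        namespace = "sparse" if "sparse" in api_yaml_path else ""
        file_definitions = gen_for_file(api_yaml_path)
        if namespace:
            forward_definition_str += (
                f"namespace {namespace} {{\n{file_definitions}}}\n")
        else:
            forward_definition_str += file_definitions
    return forward_definition_str

print(generate("api.yaml,sparse_api.yaml"))
```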
+ assert 'api' in fwd_api.keys() + assert 'args' in fwd_api.keys() + assert 'output' in fwd_api.keys() + assert 'backward' in fwd_api.keys() + + no_need_buffer_set = set() + if 'no_need_buffer' in fwd_api.keys(): + no_need_buffer_set = ParseNoNeedBuffer(fwd_api[ + 'no_need_buffer']) + + fwd_api_name = fwd_api['api'] + fwd_args_str = fwd_api['args'] + fwd_returns_str = fwd_api['output'] + + bwd_api_name = fwd_api['backward'] + assert bwd_api_name in grad_api_dict.keys() + bwd_api = grad_api_dict[bwd_api_name] + + assert 'args' in bwd_api.keys() + assert 'output' in bwd_api.keys() + assert 'forward' in bwd_api.keys() + + # Parse Dispensable Inputs + optional_inputs = [] + if 'optional' in fwd_api.keys(): + optional_inputs = ParseDispensable(fwd_api['optional']) + + bwd_forward_str = bwd_api['forward'] + bwd_args_str = bwd_api['args'] + bwd_returns_str = bwd_api['output'] + + # Collect Forward Inputs/Outputs + forward_inputs_list, forward_attrs_list, forward_returns_list = ParseYamlForwardFromBackward( + bwd_forward_str) + print("Parsed Forward Inputs List: ", forward_inputs_list) + print("Prased Forward Attrs List: ", forward_attrs_list) + print("Parsed Forward Returns List: ", forward_returns_list) + + intermediate_outputs = [] + if 'intermediate' in fwd_api.keys(): + intermediate_outputs = ParseIntermediate(fwd_api[ + 'intermediate']) + + IntermediateValidationCheck(intermediate_outputs, + forward_returns_list) + + # Collect Original Forward Inputs/Outputs and then perform validation checks + orig_forward_inputs_list, orig_forward_attrs_list, orig_forward_returns_list = ParseYamlForward( + fwd_args_str, fwd_returns_str) + print("Parsed Original Forward Inputs List: ", + orig_forward_inputs_list) + print("Prased Original Forward Attrs List: ", + orig_forward_attrs_list) + print("Parsed Original Forward Returns List: ", + orig_forward_returns_list) + + # Forward Validation Checks + ForwardsValidationCheck( + forward_inputs_list, forward_attrs_list, forward_returns_list, + orig_forward_inputs_list, orig_forward_attrs_list, + orig_forward_returns_list) + + # Parse Backward Inputs/Outputs + backward_inputs_list, backward_attrs_list, backward_returns_list = ParseYamlBackward( + bwd_args_str, bwd_returns_str) + print("Parsed Backward Inputs List: ", backward_inputs_list) + print("Prased Backward Attrs List: ", backward_attrs_list) + print("Parsed Backward Returns List: ", backward_returns_list) + + # Determine Forward Inputs/Outputs Position + forward_inputs_position_map, forward_outputs_position_map = DetermineForwardPositionMap( + forward_inputs_list, forward_returns_list) + print("Generated Forward Input Position Map: ", + forward_inputs_position_map) + print("Generated Forward Output Position Map: ", + forward_outputs_position_map) + + # SlotName Matching + backward_fwd_input_map, backward_grad_input_map, backward_grad_output_map = SlotNameMatching( + backward_inputs_list, backward_returns_list, + forward_inputs_position_map, forward_outputs_position_map) + print("Generated Backward Fwd Input Map: ", backward_fwd_input_map) + print("Generated Backward Grad Input Map: ", + backward_grad_input_map) + print("Generated Backward Grad Output Map: ", + backward_grad_output_map) + + # Backward Validation Check + BackwardValidationCheck(backward_fwd_input_map, + backward_grad_input_map, + backward_attrs_list) + + # Node Declaration Generation + yaml_node_declaration_str += GenerateNodeDeclaration( + fwd_api_name, backward_fwd_input_map, backward_attrs_list, + no_need_buffer_set) + print("Generated 
Node Declaration: ", node_declaration_str) + + yaml_node_definition_str += GenerateNodeDefinition( + fwd_api_name, bwd_api_name, backward_fwd_input_map, + backward_grad_input_map, backward_grad_output_map, + backward_attrs_list) + print("Generated Node Definition: ", node_definition_str) + + # Node Definition Generation + definition_declaration_pair = GenerateForwardDefinition( + fwd_api_name, bwd_api_name, forward_inputs_position_map, + forward_outputs_position_map, forward_attrs_list, + backward_fwd_input_map, backward_grad_input_map, + backward_grad_output_map, backward_attrs_list, optional_inputs, + intermediate_outputs) + print("Generated Forward Definition: ", forward_definition_str) + print("Generated Forward Declaration: ", forward_declaration_str) + yaml_forward_definition_str += definition_declaration_pair[0] + yaml_forward_declaration_str += definition_declaration_pair[1] + + # For python-level API dispatch + CollectCoreOpsInformation(fwd_api_name, forward_inputs_position_map, + forward_outputs_position_map, + forward_attrs_list) + + if len(namespace) > 0: + forward_definition_str += f"""namespace {namespace} {{ + {yaml_forward_definition_str} +}} +""" + + forward_declaration_str += f"""namespace {namespace} {{ + {yaml_forward_declaration_str} +}} +""" + + node_declaration_str += f"""namespace {namespace} {{ + {yaml_node_declaration_str} +}} +""" + + node_definition_str += f"""namespace {namespace} {{ + {yaml_node_definition_str} +}} +""" + + else: + forward_definition_str += yaml_forward_definition_str + forward_declaration_str += yaml_forward_declaration_str + node_declaration_str += yaml_node_declaration_str + node_definition_str += yaml_node_definition_str # Generate Files nodes_h_path = args.nodes_h_path diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py index 9329dc5ffc9dd0faa36b8ff6a8373387bc2678c7..eee32a2c5057d523212a4faa5eca8678e961f417 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py @@ -14,34 +14,28 @@ import os import argparse -from eager_gen import ReadFwdFile, ParseDispensable, IsVectorTensorType, GetForwardFunctionName, ParseYamlForward, DetermineForwardPositionMap +from eager_gen import namespace, yaml_types_mapping, ReadFwdFile, ParseDispensable, IsVectorTensorType, GetForwardFunctionName, ParseYamlForward, DetermineForwardPositionMap + +skipped_fwd_api_names = set(["scale"]) atype_to_parsing_function = { "bool": "CastPyArg2Boolean", "int": "CastPyArg2Int", "long": "CastPyArg2Long", + "int64_t": "CastPyArg2Long", "float": "CastPyArg2Float", "string": "CastPyArg2String", - "bool[]": "CastPyArg2Booleans", - "int[]": "CastPyArg2Ints", - "long[]": "CastPyArg2Longs", - "float[]": "CastPyArg2Floats", - "double[]": "CastPyArg2Float64s", - "string[]": "CastPyArg2Strings" -} - -atype_to_cxx_type = { - "bool": "bool", - "int": "int", - "long": "long", - "float": "float", - "string": "std::string", - "bool[]": "std::vector", - "int[]": "std::vector", - "long[]": "std::vector", - "float[]": "std::vector", - "double[]": "std::vector", - "string[]": "std::vector" + "std::vector": "CastPyArg2Booleans", + "std::vector": "CastPyArg2Ints", + "std::vector": "CastPyArg2Longs", + "std::vector": "CastPyArg2Longs", + "std::vector": "CastPyArg2Floats", + "std::vector": "CastPyArg2Float64s", + "std::vector": "CastPyArg2Strings", + 
"paddle::experimental::Scalar": "CastPyArg2Scalar", + "paddle::experimental::ScalarArray": "CastPyArg2ScalarArray", + "paddle::experimental::Backend": "CastPyArg2Backend", + "paddle::experimental::DataType": "CastPyArg2DataType", } @@ -55,15 +49,9 @@ def ParseArguments(): return args -def GetCxxType(atype): - if atype not in atype_to_cxx_type.keys(): - assert False - - return atype_to_cxx_type[atype] - - def FindParsingFunctionFromAttributeType(atype): if atype not in atype_to_parsing_function.keys(): + print(f"Unable to find {atype} in atype_to_parsing_function.") assert False return atype_to_parsing_function[atype] @@ -71,7 +59,7 @@ def FindParsingFunctionFromAttributeType(atype): def GeneratePythonCFunction(fwd_api_name, forward_inputs_position_map, forward_attrs_list, forward_outputs_position_map, - optional_inputs): + optional_inputs, is_forward_only): # forward_inputs_position_map = { "name" : [type, fwd_position] } # forward_outputs_position_map = { "name" : [type, fwd_position] } # forward_attrs_list = [ [attr_name, attr_type, default_value, orig_position], ...] @@ -98,11 +86,10 @@ def GeneratePythonCFunction(fwd_api_name, forward_inputs_position_map, # Get Attributes for name, atype, _, pos in forward_attrs_list: parsing_function = FindParsingFunctionFromAttributeType(atype) - cxx_type = GetCxxType(atype) key = f"{name}" parse_attributes_str += f" PyObject* {name}_obj = PyTuple_GET_ITEM(args, {pos});\n" - parse_attributes_str += f" {cxx_type} {name} = {parsing_function}({name}_obj, \"{fwd_api_name}\", {pos});\n" + parse_attributes_str += f" {atype} {name} = {parsing_function}({name}_obj, \"{fwd_api_name}\", {pos});\n" dygraph_function_call_list[pos] = f"{name}" dygraph_function_call_str = ",".join(dygraph_function_call_list) @@ -139,11 +126,20 @@ static PyObject * eager_final_state_api_{}(PyObject *self, PyObject *args, PyObj }} """ + namespace_str = "" + if len(namespace) > 0: + namespace_str = f"{namespace}::" + + if is_forward_only: + fwd_function_name = "paddle::experimental::" + namespace_str + fwd_api_name + else: + fwd_function_name = namespace_str + GetForwardFunctionName(fwd_api_name) + python_c_function_str = PYTHON_C_FUNCTION_TEMPLATE.format( fwd_api_name, fwd_api_name, get_eager_tensor_str, parse_attributes_str, - GetForwardFunctionName(fwd_api_name), dygraph_function_call_str) + fwd_function_name, dygraph_function_call_str) - python_c_function_reg_str = f"{{\"final_state_{fwd_api_name}\", (PyCFunction)(void(*)(void))eager_final_state_api_{fwd_api_name}, METH_VARARGS | METH_KEYWORDS, \"C++ interface function for {fwd_api_name} in dygraph.\"}}\n" + python_c_function_reg_str = f"{{\"final_state_{fwd_api_name}\", (PyCFunction)(void(*)(void)) {namespace_str}eager_final_state_api_{fwd_api_name}, METH_VARARGS | METH_KEYWORDS, \"C++ interface function for {fwd_api_name} in dygraph.\"}}\n" return python_c_function_str, python_c_function_reg_str @@ -197,7 +193,7 @@ static PyObject * eager_get_final_state_core_ops_returns_info(PyObject *self) { """ core_ops_infos_registry = """ - ,{\"get_final_state_core_ops_args_info\", + {\"get_final_state_core_ops_args_info\", (PyCFunction)(void(*)(void))eager_get_final_state_core_ops_args_info, METH_NOARGS, \"C++ interface function for eager_get_final_state_core_ops_args_info.\"}, {\"get_final_state_core_ops_args_type_info\", @@ -225,6 +221,13 @@ def GeneratePythonCWrappers(python_c_function_str, python_c_function_reg_str): #pragma once #include "pybind11/detail/common.h" +#include "paddle/phi/api/all.h" +#include 
"paddle/phi/api/lib/dygraph_api.h" +#include "paddle/phi/common/backend.h" +#include "paddle/phi/common/data_type.h" +#include "paddle/phi/common/scalar.h" +#include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/api/include/sparse_api.h" #include "paddle/fluid/pybind/op_function_common.h" #include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h" #include "paddle/fluid/pybind/exception.h" @@ -257,53 +260,80 @@ def GeneratePythonCFile(filepath, python_c_str): if __name__ == "__main__": args = ParseArguments() - api_yaml_path = args.api_yaml_path - fwd_api_list = ReadFwdFile(api_yaml_path) - - python_c_function_list = [] - python_c_function_reg_list = [] - for fwd_api in fwd_api_list: - # We only generate Ops with grad - if 'backward' not in fwd_api.keys(): - continue - - assert 'api' in fwd_api.keys() - assert 'args' in fwd_api.keys() - assert 'output' in fwd_api.keys() - assert 'backward' in fwd_api.keys() - - fwd_api_name = fwd_api['api'] - fwd_args_str = fwd_api['args'] - fwd_returns_str = fwd_api['output'] - - # Parse Dispensable Inputs - optional_inputs = [] - if 'optional' in fwd_api.keys(): - optional_inputs = ParseDispensable(fwd_api['optional']) - - # Collect Original Forward Inputs/Outputs and then perform validation checks - forward_inputs_list, forward_attrs_list, forward_returns_list = ParseYamlForward( - fwd_args_str, fwd_returns_str) - print("Parsed Original Forward Inputs List: ", forward_inputs_list) - print("Prased Original Forward Attrs List: ", forward_attrs_list) - print("Parsed Original Forward Returns List: ", forward_returns_list) - - forward_inputs_position_map, forward_outputs_position_map = DetermineForwardPositionMap( - forward_inputs_list, forward_returns_list) - print("Generated Forward Input Position Map: ", - forward_inputs_position_map) - print("Generated Forward Output Position Map: ", - forward_outputs_position_map) - - python_c_function_str, python_c_function_reg_str = GeneratePythonCFunction( - fwd_api_name, forward_inputs_position_map, forward_attrs_list, - forward_outputs_position_map, optional_inputs) - python_c_function_list.append(python_c_function_str) - python_c_function_reg_list.append(python_c_function_reg_str) - print("Generated Python-C Function: ", python_c_function_str) - - python_c_functions_str = "\n".join(python_c_function_list) - python_c_functions_reg_str = ",\n".join(python_c_function_reg_list) + api_yaml_paths = args.api_yaml_path.split(",") + + python_c_functions_reg_str = "" + python_c_functions_str = "" + + for i in range(len(api_yaml_paths)): + api_yaml_path = api_yaml_paths[i] + + if "sparse" in api_yaml_path: + namespace = "sparse" + else: + namespace = "" + + fwd_api_list = ReadFwdFile(api_yaml_path) + + python_c_function_list = [] + python_c_function_reg_list = [] + for fwd_api in fwd_api_list: + + # We only generate Ops with grad + is_forward_only = False + if 'backward' not in fwd_api.keys(): + is_forward_only = True + + assert 'api' in fwd_api.keys() + assert 'args' in fwd_api.keys() + assert 'output' in fwd_api.keys() + + fwd_api_name = fwd_api['api'] + fwd_args_str = fwd_api['args'] + fwd_returns_str = fwd_api['output'] + + if fwd_api_name in skipped_fwd_api_names: + continue + + # Parse Dispensable Inputs + optional_inputs = [] + if 'optional' in fwd_api.keys(): + optional_inputs = ParseDispensable(fwd_api['optional']) + + # Collect Original Forward Inputs/Outputs and then perform validation checks + forward_inputs_list, forward_attrs_list, forward_returns_list = ParseYamlForward( 
+ fwd_args_str, fwd_returns_str) + print("Parsed Original Forward Inputs List: ", forward_inputs_list) + print("Prased Original Forward Attrs List: ", forward_attrs_list) + print("Parsed Original Forward Returns List: ", + forward_returns_list) + + forward_inputs_position_map, forward_outputs_position_map = DetermineForwardPositionMap( + forward_inputs_list, forward_returns_list) + print("Generated Forward Input Position Map: ", + forward_inputs_position_map) + print("Generated Forward Output Position Map: ", + forward_outputs_position_map) + + python_c_function_str, python_c_function_reg_str = GeneratePythonCFunction( + fwd_api_name, forward_inputs_position_map, forward_attrs_list, + forward_outputs_position_map, optional_inputs, is_forward_only) + python_c_function_list.append(python_c_function_str) + python_c_function_reg_list.append(python_c_function_reg_str) + print("Generated Python-C Function: ", python_c_function_str) + + # Append Namespace + python_c_functions_reg_str += ",\n".join( + python_c_function_reg_list) + "," + python_c_functions = "\n".join(python_c_function_list) + if len(namespace) > 0: + python_c_functions_str += f"""namespace {namespace} {{ + {python_c_functions} +}} +""" + + else: + python_c_functions_str += python_c_functions python_c_str = GeneratePythonCWrappers(python_c_functions_str, python_c_functions_reg_str) diff --git a/paddle/fluid/eager/autograd_meta.h b/paddle/fluid/eager/autograd_meta.h index 9e1dc4f2c8c6ba5c1c7d0c49e5d141d1a6c4c6d3..dca76d3b8a0db8c4284960005bfbad33ce23e20d 100644 --- a/paddle/fluid/eager/autograd_meta.h +++ b/paddle/fluid/eager/autograd_meta.h @@ -145,8 +145,7 @@ class AutogradMeta : public AbstractAutogradMeta { private: // TODO(jiabin) :Should we use pointer instead of object? std::shared_ptr grad_{ - std::make_shared( - egr::Controller::Instance().GenerateUniqueName("@grad"))}; + std::make_shared()}; // GradNodeBase is base class of all grad op which is a // wrapper for grad op. 
This class will make grad op easy diff --git a/paddle/fluid/eager/backward.cc b/paddle/fluid/eager/backward.cc index 7073ca8f0527ba8237da734db0c8724baa2a49ec..934497d7d179c1732bde68c147ed86661c25ddae 100644 --- a/paddle/fluid/eager/backward.cc +++ b/paddle/fluid/eager/backward.cc @@ -48,12 +48,16 @@ std::unordered_map getInDegreeMap( } visited.insert(node); + PADDLE_ENFORCE_NOT_NULL( + node, + paddle::platform::errors::Fatal( + "We got null node when we traverse the backward graph, and this " + "should not happened please check your code and contact us.")); // Find and append next nodes const std::vector>& edges = node->GetEdges(); for (const auto& edge_list : edges) { for (const Edge& edge : edge_list) { GradNodeBase* next_node = edge.GetMutableGradNode().get(); - // Next node could be nullptr if it is leaf tensor with no // AccumulationNode attached // Or it could also originated from dispensable inputs @@ -67,7 +71,6 @@ std::unordered_map getInDegreeMap( } } } - return node_in_degree_map; } @@ -221,10 +224,11 @@ void RunBackward(const std::vector& tensors, << " 's name is: " << grad_output_tensor.name(); auto* next_node = next_node_shared.get(); - if (!node_input_buffers_dict.count(next_node)) { - node_input_buffers_dict[next_node] = - std::make_unique(next_node->InputMeta()); + const auto& input_meta = next_node->InputMeta(); + auto grad_tensor_holder = + std::make_unique(input_meta); + node_input_buffers_dict[next_node] = std::move(grad_tensor_holder); } VLOG(6) << "Sum grad inputs for edge slot: " << edge_rank.first << ", rank: " << edge_rank.second; diff --git a/paddle/fluid/eager/grad_node_info.cc b/paddle/fluid/eager/grad_node_info.cc index 35416281f188892ec11413a19abad9b3e5c29e76..427be83c3bbee31eaa0c7e3d26d2d9599b344450 100644 --- a/paddle/fluid/eager/grad_node_info.cc +++ b/paddle/fluid/eager/grad_node_info.cc @@ -30,6 +30,7 @@ namespace egr { GradNodeBase::GradNodeBase(size_t bwd_in_slot_num, size_t bwd_out_slot_num) { + VLOG(6) << "Construct GradNodeBase"; bwd_in_meta_.resize(bwd_in_slot_num); bwd_out_meta_.resize(bwd_out_slot_num); // adj_edges has the same num as backward outputs @@ -49,11 +50,15 @@ void GradNodeBase::AddEdges(std::vector* metas, size_t slot_id) { // its pre-ops if (meta && !meta->StopGradient()) { auto node = meta->GetMutableGradNode(); - if (node) { + if (node && node.get()) { + VLOG(6) << "Add Edges for slot: " << slot_id + << " which is: " << meta->GetMutableGradNode()->name(); adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), meta->OutRankInfo()); } else { meta->SetGradNode(std::make_shared(meta)); + VLOG(6) << "Add Edges for slot: " << slot_id + << " which is: " << meta->GetMutableGradNode()->name(); adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), meta->OutRankInfo()); } @@ -70,7 +75,7 @@ void GradNodeBase::AddEdges(AutogradMeta* meta, size_t slot_id) { "inputs's slot num.")); if (meta && !meta->StopGradient()) { auto node = meta->GetMutableGradNode(); - if (node) { + if (node && node.get()) { VLOG(6) << "Add Edges for slot: " << slot_id << ", the Edge is from " << this->name() << " to " << meta->GetMutableGradNode()->name(); adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), @@ -244,7 +249,7 @@ GradNodeBase::ApplyGradientHooks( if (!out.defined() || !out.initialized()) { out = (*hook)(tensors[slot_id][rank]); } else { - // If more than one hook is registered, the input to the next hook func + // If more than one hook is registered, the input to the next hook func // should be the output of the previous hook out = 
(*hook)(out); } diff --git a/paddle/fluid/eager/grad_node_info.h b/paddle/fluid/eager/grad_node_info.h index eeac1cca4acf33190ce30613e4a86e99a95b651b..16513f05e0777a8e57f54c925d68867dda656612 100644 --- a/paddle/fluid/eager/grad_node_info.h +++ b/paddle/fluid/eager/grad_node_info.h @@ -76,10 +76,10 @@ class GradSlotMeta { class GradNodeBase { public: - GradNodeBase() = default; + GradNodeBase() { VLOG(6) << "Construct GradNodeBase"; } GradNodeBase(size_t bwd_in_slot_num, size_t bwd_out_slot_num); // TODO(jiabin): Should we have other constructor here? - virtual ~GradNodeBase() = default; + virtual ~GradNodeBase() { VLOG(6) << "Destruct GradNodeBase"; } /** * operator() designed to contian the real backward execution logic, it should diff --git a/paddle/fluid/eager/tests/CMakeLists.txt b/paddle/fluid/eager/tests/CMakeLists.txt index c1506d8139b432c93d0bed35073b404192a927f6..2bfb9937c8c9167d712535dca71ef02efa1f3f78 100644 --- a/paddle/fluid/eager/tests/CMakeLists.txt +++ b/paddle/fluid/eager/tests/CMakeLists.txt @@ -1,6 +1,6 @@ add_subdirectory(data_structure_tests) add_subdirectory(task_tests) -if(NOT ON_INFER) +if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) add_subdirectory(performance_tests) endif() diff --git a/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h b/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h index bb84e2dda81bafe624fe7734a0a47391eeb0adfa..535c93ac53b1751d9634476e47f32dc0cbe22708 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h +++ b/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h @@ -30,6 +30,7 @@ class GradTestNode : public egr::GradNodeBase { GradTestNode(float val, int in_num, int out_num) : GradNodeBase(in_num, out_num), val_(val) {} GradTestNode() : GradNodeBase() { val_ = 1.0; } + std::string name() override { return "GradTestNode"; } std::vector> operator()( const std::vector>& grads) override { diff --git a/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc b/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc index 8c6eeca9d3d5d80fd5bfe943ef87ba8640ada4f2..384fdcd6f97c4b318341db68cdd88b644d42d22a 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc +++ b/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc @@ -24,6 +24,8 @@ #include "paddle/phi/core/kernel_registry.h" +PD_DECLARE_KERNEL(full_like, CPU, ALL_LAYOUT); + // TODO(jiabin): remove nolint here!!! 
using namespace egr; // NOLINT diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc index 6c4bf9a4f17e6f88503f0a1d6ec2f3029000b6f0..adb3246ee8c808c9f62fde0228f40cccb2f9ac88 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc @@ -33,6 +33,14 @@ #include "gperftools/profiler.h" #endif +#include "paddle/phi/core/kernel_registry.h" + +PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(matmul, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(matmul_grad, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT); + using namespace egr; // NOLINT using namespace egr_utils_api; // NOLINT @@ -72,6 +80,47 @@ TEST(Benchmark, EagerScaleCPU) { } } +TEST(Benchmark, EagerMatmulCPU) { + // Prepare Device Contexts + eager_test::InitEnv(paddle::platform::CPUPlace()); + + for (const std::string& mode : {"Accuracy", "Performance"}) { + paddle::framework::DDim ddimX = phi::make_ddim({2, 2}); + paddle::experimental::Tensor X = CreateTensorWithValue( + ddimX, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 1.0, true); + RetainGradForTensor(X); + + paddle::framework::DDim ddimY = phi::make_ddim({2, 2}); + paddle::experimental::Tensor Y = CreateTensorWithValue( + ddimY, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 2.0, true); + RetainGradForTensor(Y); + + if (mode == "Accuracy") { + benchmark_eager_matmul(X, Y, true /* accuracy_check */); + + } else if (mode == "Performance") { + auto t_start = std::chrono::high_resolution_clock::now(); +#ifdef WITH_GPERFTOOLS + ProfilerStart("eager_matmul_cpu.out"); +#endif + benchmark_eager_matmul(X, Y); + +#ifdef WITH_GPERFTOOLS + ProfilerStop(); +#endif + auto t_end = std::chrono::high_resolution_clock::now(); + double elapsed_time_ms = + std::chrono::duration(t_end - t_start).count(); + std::cout << "Duration: " << elapsed_time_ms << " ms" << std::endl; + + } else { + PADDLE_THROW(paddle::platform::errors::Fatal("Unknown benchmark mode")); + } + } +} + TEST(Benchmark, EagerIntermediateMatmulCPU) { // Prepare Device Contexts eager_test::InitEnv(paddle::platform::CPUPlace()); diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc index 14e7ce8cfcfb4dea0907cd128873223c8e5859a2..bd70e84d9b461490f53ac6692d55860da1bfc9d8 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc @@ -32,11 +32,19 @@ #include "gperftools/profiler.h" #endif +#include "paddle/phi/core/kernel_registry.h" + using namespace egr; // NOLINT using namespace egr_utils_api; // NOLINT #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_DECLARE_KERNEL(full, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(matmul, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(matmul_grad, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add_grad, GPU, ALL_LAYOUT); + TEST(Benchmark, EagerScaleCUDA) { eager_test::InitEnv(paddle::platform::CUDAPlace()); @@ -74,6 +82,50 @@ TEST(Benchmark, EagerScaleCUDA) { } } +TEST(Benchmark, EagerMatmulCUDA) { + paddle::platform::CUDAPlace place; + eager_test::InitEnv(place); + + for (const std::string& mode : {"Accuracy", "WarmUp", "Performance"}) { + paddle::framework::DDim ddimX = 
phi::make_ddim({2, 2}); + paddle::experimental::Tensor X = CreateTensorWithValue( + ddimX, paddle::platform::CUDAPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 1.0, true); + RetainGradForTensor(X); + + paddle::framework::DDim ddimY = phi::make_ddim({2, 2}); + paddle::experimental::Tensor Y = CreateTensorWithValue( + ddimY, paddle::platform::CUDAPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 2.0, true); + RetainGradForTensor(Y); + + if (mode == "Accuracy") { + benchmark_eager_matmul(X, Y, true /* accuracy_check */); + + } else if (mode == "WarmUp") { + benchmark_eager_matmul(X, Y); + + } else if (mode == "Performance") { + auto t_start = std::chrono::high_resolution_clock::now(); +#ifdef WITH_GPERFTOOLS + ProfilerStart("eager_matmul_cuda.out"); +#endif + benchmark_eager_matmul(X, Y); + +#ifdef WITH_GPERFTOOLS + ProfilerStop(); +#endif + auto t_end = std::chrono::high_resolution_clock::now(); + double elapsed_time_ms = + std::chrono::duration(t_end - t_start).count(); + std::cout << "Duration: " << elapsed_time_ms << " ms" << std::endl; + + } else { + PADDLE_THROW(paddle::platform::errors::Fatal("Unknown benchmark mode")); + } + } +} + TEST(Benchmark, EagerIntermediateMatmulCUDA) { paddle::platform::CUDAPlace place; eager_test::InitEnv(place); @@ -186,7 +238,7 @@ TEST(Benchmark, EagerIntermediateMLPCUDA) { USE_OP_ITSELF(scale); USE_OP_ITSELF(matmul_v2); USE_OP_ITSELF(reduce_sum); -USE_OP(reduce_sum_grad); +USE_OP_ITSELF(reduce_sum_grad); USE_OP_ITSELF(elementwise_add); #endif // PADDLE_WITH_CUDA || PADDLE_WITH_HIP diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc index 3292de9363696dae30d853980eca6fb1ba1055cc..a9d297c1c64f7b64373237a0500802a5c883aedd 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc @@ -34,6 +34,14 @@ #include "gperftools/profiler.h" #endif +#include "paddle/phi/core/kernel_registry.h" + +PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(matmul, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(matmul_grad, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT); + namespace paddle { namespace imperative { diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc index e9b7d10070dbf22f10e617d34f143992d19fb659..bd9eaa09ca9a406da943c8a0b0f37b674d5ea3c2 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc @@ -34,8 +34,16 @@ #include "gperftools/profiler.h" #endif +#include "paddle/phi/core/kernel_registry.h" + #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_DECLARE_KERNEL(full, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(matmul, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(matmul_grad, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add_grad, GPU, ALL_LAYOUT); + namespace paddle { namespace imperative { @@ -248,7 +256,7 @@ TEST(Benchmark, FluidMLPCUDA) { USE_OP_ITSELF(scale); USE_OP_ITSELF(matmul_v2); USE_OP_ITSELF(reduce_sum); -USE_OP(reduce_sum_grad); +USE_OP_ITSELF(reduce_sum_grad); USE_OP_ITSELF(elementwise_add); #endif // PADDLE_WITH_CUDA || PADDLE_WITH_HIP diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc 
index 96126fa5466aace442dfb742f9902539916b853e..769bd7f687f4584d44bbfa30b73611a3128289bf 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc @@ -28,6 +28,7 @@ #include "paddle/fluid/eager/utils.h" // Eager Generated +#include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h" #include "paddle/fluid/eager/api/generated/fluid_generated/dygraph_forward_api.h" // Fluid @@ -67,6 +68,29 @@ void benchmark_eager_scale(const paddle::experimental::Tensor& tensor, } } +void benchmark_eager_matmul(const paddle::experimental::Tensor& X, + const paddle::experimental::Tensor& Y, + bool accuracy_check) { + paddle::experimental::Tensor input_tensor0 = X; + + size_t max_num_runs = accuracy_check ? 2 : max_num_benchmark_runs; + for (size_t i = 0; i < max_num_runs; i++) { + input_tensor0 = + matmul_final_state_dygraph_function(input_tensor0, Y, false, false); + } + + std::vector target_tensors = {input_tensor0}; + RunBackward(target_tensors, {}); + + if (accuracy_check) { + // Examine Forward Grad (w.r.t max_num_runs = 2) + eager_test::CompareTensorWithValue(input_tensor0, 16); + // Examine Backward Grad (w.r.t max_num_runs = 2) + eager_test::CompareGradTensorWithValue(X, 16); + eager_test::CompareGradTensorWithValue(Y, 16); + } +} + /* ----------------------------------- */ /* ---- Eager Intermediate Matmul ---- */ /* ----------------------------------- */ diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.h b/paddle/fluid/eager/tests/performance_tests/benchmark_utils.h index 0086b51b57e152c6da935eacba8d93c0d6ab1a71..86bf13707ed40b0c37ccb54695cca3d165768cb6 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.h +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_utils.h @@ -51,15 +51,10 @@ void benchmark_eager_scale(const paddle::experimental::Tensor& tensor, bool accuracy_check = false); /* ---- Eager MatMul ---- */ -/* -void benchmark_eager_matmul(const paddle::experimental::Tensor& X, const -paddle::experimental::Tensor& Y, +void benchmark_eager_matmul(const paddle::experimental::Tensor& X, + const paddle::experimental::Tensor& Y, bool accuracy_check = false); -void benchmark_eager_mlp(const paddle::experimental::Tensor& X, - const std::vector& Ws, - const std::vector& Bs, - bool accuracy_check = false); -*/ + void benchmark_eager_intermediate_matmul(const paddle::experimental::Tensor& X, const paddle::experimental::Tensor& Y, bool accuracy_check = false); diff --git a/paddle/fluid/eager/tests/task_tests/CMakeLists.txt b/paddle/fluid/eager/tests/task_tests/CMakeLists.txt index dbdb52eb53655201ac06b1362c9776ba98bba3eb..c65ad4641cf2206cc0f97d91f1fb24e50b7b63cd 100644 --- a/paddle/fluid/eager/tests/task_tests/CMakeLists.txt +++ b/paddle/fluid/eager/tests/task_tests/CMakeLists.txt @@ -6,7 +6,7 @@ cc_test(test_egr_task_hook SRCS hook_test.cc DEPS ${eager_deps} ${fluid_deps} ea cc_test(test_egr_task_cross_batch SRCS cross_batch_accumulation_test.cc DEPS ${eager_deps} ${fluid_deps} eager_scale scale_node) cc_test(test_egr_task_fwd_bwd_joint SRCS fwd_bwd_joint_test.cc DEPS ${eager_deps} ${fluid_deps} eager_scale scale_node) -if(NOT ON_INFER) +if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) cc_test(test_egr_task_hook_intermidiate SRCS hook_test_intermidiate.cc DEPS ${eager_deps} ${fluid_deps} ${generated_deps} dygraph_node) cc_test(test_egr_task_autocodegen SRCS generated_test.cc DEPS ${eager_deps} ${fluid_deps} ${generated_deps}) endif() diff --git 
a/paddle/fluid/eager/tests/task_tests/backward_test.cc b/paddle/fluid/eager/tests/task_tests/backward_test.cc index a4bc56bd606f3fbb0f9152d58acb5c8edeecf905..0c894ed267fcdd08d44d4df08bfaf0554874aebf 100644 --- a/paddle/fluid/eager/tests/task_tests/backward_test.cc +++ b/paddle/fluid/eager/tests/task_tests/backward_test.cc @@ -30,6 +30,10 @@ #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/tensor_meta.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); + namespace egr { TEST(Backward, SingleNodeEmptyGrad) { diff --git a/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc b/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc index 524872b2e55638d25697388aa50724f49f6e3818..36594f1aac8cdb131bb77f1396dca19a0c2e8cc0 100644 --- a/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc +++ b/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc @@ -31,6 +31,10 @@ #include "paddle/fluid/eager/tests/test_utils.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); + namespace egr { TEST(CrossBatchAccumulation, SingleScaleNode) { diff --git a/paddle/fluid/eager/tests/task_tests/forward_autograd_test.cc b/paddle/fluid/eager/tests/task_tests/forward_autograd_test.cc index 49bbfc77741a5b82ac9a564e25b484e5dabf77a7..dc44d95daac1d9109bbf2a1d04a8a47b081cead9 100644 --- a/paddle/fluid/eager/tests/task_tests/forward_autograd_test.cc +++ b/paddle/fluid/eager/tests/task_tests/forward_autograd_test.cc @@ -27,6 +27,10 @@ #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/tensor_meta.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); + namespace egr { TEST(Forward, SingleNode) { diff --git a/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc b/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc index 5a7bafb2fe37051c0ad054c130d77dd6e05319d2..f7fa642ea8dd17d20816e74c9bfb4cd92b184b4a 100644 --- a/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc +++ b/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc @@ -30,6 +30,13 @@ #include "paddle/fluid/eager/hooks.h" #include "paddle/fluid/eager/tests/test_utils.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_DECLARE_KERNEL(full, GPU, ALL_LAYOUT); +#endif + namespace egr { paddle::experimental::Tensor hook_function( diff --git a/paddle/fluid/eager/tests/task_tests/generated_test.cc b/paddle/fluid/eager/tests/task_tests/generated_test.cc index 4b7077b13bdd6c48a0a3846656bd3a6337eb9f80..2a5ad53204a6201149bec0b3dac0fa3baf441f2e 100644 --- a/paddle/fluid/eager/tests/task_tests/generated_test.cc +++ b/paddle/fluid/eager/tests/task_tests/generated_test.cc @@ -30,6 +30,12 @@ #include "paddle/fluid/eager/api/generated/fluid_generated/dygraph_forward_api.h" #include "paddle/phi/core/kernel_registry.h" +PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(matmul, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(matmul_grad, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT); + namespace egr { TEST(Generated, Sigmoid) { diff --git a/paddle/fluid/eager/tests/task_tests/hook_test.cc b/paddle/fluid/eager/tests/task_tests/hook_test.cc index 9cda961741f55e9b4b7fc8dac61fe4a7c96567cf..d546df4ed087a99a28096a5336fab3826991534a 100644 --- a/paddle/fluid/eager/tests/task_tests/hook_test.cc +++ 
b/paddle/fluid/eager/tests/task_tests/hook_test.cc @@ -31,6 +31,10 @@ #include "paddle/fluid/eager/hooks.h" #include "paddle/fluid/eager/tests/test_utils.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); + namespace egr { paddle::experimental::Tensor hook_function( diff --git a/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc b/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc index 15b2a62dca751859882e82d46acaa46f27c2c518..56813c498d2410caa452da7a334c393b230c65bf 100644 --- a/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc +++ b/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc @@ -27,6 +27,12 @@ #include "paddle/fluid/eager/hooks.h" #include "paddle/phi/core/kernel_registry.h" +PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(matmul, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(matmul_grad, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT); + namespace egr { paddle::experimental::Tensor hook_function( diff --git a/paddle/fluid/eager/tests/task_tests/tensor_utils_test.cc b/paddle/fluid/eager/tests/task_tests/tensor_utils_test.cc index ea821d195099f3d632e0d1b2d4937bac812563c8..24e5da060111f083ef9b65574e75295fa07f8f43 100644 --- a/paddle/fluid/eager/tests/task_tests/tensor_utils_test.cc +++ b/paddle/fluid/eager/tests/task_tests/tensor_utils_test.cc @@ -23,6 +23,10 @@ #include "paddle/fluid/eager/tests/test_utils.h" #include "paddle/phi/api/lib/utils/allocator.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); + namespace egr { TEST(TensorUtils, Test) { diff --git a/paddle/fluid/eager/to_static/run_program_op_func.h b/paddle/fluid/eager/to_static/run_program_op_func.h new file mode 100644 index 0000000000000000000000000000000000000000..6f8bccd64e45f015a5c1aed44fbfdfc6f68660f1 --- /dev/null +++ b/paddle/fluid/eager/to_static/run_program_op_func.h @@ -0,0 +1,82 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include + +#include "paddle/fluid/eager/autograd_meta.h" +#include "paddle/fluid/eager/eager_tensor.h" +#include "paddle/fluid/eager/to_static/run_program_op_node.h" +#include "paddle/fluid/eager/utils.h" + +inline void run_program_dygraph_function( + const std::vector& x, + const std::vector& params, + std::vector& out, // NOLINT + std::vector& step_scope, // NOLINT + std::vector& dout, // NOLINT + const paddle::framework::AttributeMap& attrs) { + VLOG(2) << "start run run_program"; + // Call forward function + RunProgramAPI(x, params, out, step_scope, dout, attrs); + VLOG(2) << "start run run_program grad"; + + // Prepare Autograd Meta + auto deref_out = details::DereferenceTensors(out); + std::vector p_autograd_x = + egr::EagerUtils::nullable_autograd_meta(x); + std::vector p_autograd_params = + egr::EagerUtils::nullable_autograd_meta(params); + std::vector p_autograd_outs = + egr::EagerUtils::nullable_autograd_meta(deref_out); + + bool trace_backward = egr::Controller::Instance().HasGrad(); + bool require_any_grad = egr::EagerUtils::ComputeRequireGrad( + trace_backward, &p_autograd_x, &p_autograd_params); + + if (require_any_grad) { + std::vector out_names; + for (auto& t : deref_out) { + out_names.emplace_back(t.name()); + } + + egr::EagerUtils::PassStopGradient(false, &p_autograd_outs); + // Create GradOpNode (1 means [out_grad], 2 means [x_grad, paramx_grad]) + auto grad_node = std::make_shared(1, 2); + + grad_node->SetFwdOutNames(out_names); + // Set Attributes + grad_node->SetAttrMap(attrs); + // Set TensorWrappers + grad_node->SetFwdX(x); + grad_node->SetFwdParams(params); + grad_node->SetStepScope(step_scope); + + // Set Grad out rank as same as fwd input and set stop gradient to bwd + grad_node->SetGradOutMeta(&p_autograd_x, /*slot id*/ 0); + grad_node->SetGradOutMeta(&p_autograd_params, /*slot id*/ 1); + + grad_node->SetGradInMeta(&p_autograd_outs, 0); + // Set Next Edges + grad_node->AddEdges(&p_autograd_x, /*slot id*/ 0); + grad_node->AddEdges(&p_autograd_params, /*slot id*/ 1); + + egr::EagerUtils::SetOutRankWithSlot(&p_autograd_outs, 0); + + // Set History for output set current Grad Node for + egr::EagerUtils::SetHistory(&p_autograd_outs, grad_node); + egr::EagerUtils::CheckAndRetainGrad(deref_out); + } +} diff --git a/paddle/fluid/eager/to_static/run_program_op_node.h b/paddle/fluid/eager/to_static/run_program_op_node.h new file mode 100644 index 0000000000000000000000000000000000000000..ae5d86664a346fd8a1d877f9e1dd74f687302595 --- /dev/null +++ b/paddle/fluid/eager/to_static/run_program_op_node.h @@ -0,0 +1,468 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
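run_program_dygraph_function above follows the usual eager-mode wiring: it runs the forward program through RunProgramAPI, and only when some input requires grad does it build a GradNodeRunProgram, record the forward tensors, scopes, and attributes on it, and register it as the history of the outputs. Per the comment in that code ("1 means [out_grad], 2 means [x_grad, paramx_grad]"), the node has one backward-input slot and two backward-output slots. A short fragment spelling out that slot layout, mirroring the calls in the function above (not a standalone program):

    // Sketch of the grad-node wiring in run_program_dygraph_function:
    //   bwd_in_slot_num  = 1 -> slot 0 carries the gradients of `out`
    //   bwd_out_slot_num = 2 -> slot 0: gradients of `x`, slot 1: gradients of `params`
    auto grad_node = std::make_shared<GradNodeRunProgram>(1, 2);
    grad_node->SetGradOutMeta(&p_autograd_x, /*slot id*/ 0);
    grad_node->SetGradOutMeta(&p_autograd_params, /*slot id*/ 1);
    grad_node->SetGradInMeta(&p_autograd_outs, /*slot id*/ 0);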
+ +#pragma once + +#include "paddle/fluid/eager/api/utils/global_utils.h" +#include "paddle/fluid/eager/grad_node_info.h" +#include "paddle/fluid/eager/tensor_wrapper.h" + +#include "paddle/fluid/operators/run_program_op.h" +#include "paddle/fluid/platform/enforce.h" + +namespace details { +using Tensor = paddle::experimental::Tensor; + +static std::vector DereferenceTensors( + const std::vector &tensor_ptr) { + std::vector res; + for (auto *t : tensor_ptr) { + res.emplace_back(*t); + } + return res; +} + +static std::vector GetTensorsName(const std::vector &ins) { + std::vector in_names; + for (auto &in_t : ins) { + in_names.emplace_back(in_t.name()); + } + return in_names; +} + +static std::vector GetTensorsName( + const std::vector &ins) { + std::vector in_names; + for (auto *in_t : ins) { + in_names.emplace_back(in_t->name()); + } + return in_names; +} + +static void CheckInputVarStatus(const Tensor &tensor) { + PADDLE_ENFORCE_EQ( + tensor.defined() && phi::DenseTensor::classof(tensor.impl().get()), true, + paddle::platform::errors::InvalidArgument( + "The input tensor %s of " + "RunProgram(Grad)Op holds " + "wrong type. Expect type is DenseTensor.", + tensor.name())); + + PADDLE_ENFORCE_EQ(tensor.initialized(), true, + paddle::platform::errors::InvalidArgument( + "The tensor in input tensor %s of " + "RunProgram(Grad)Op " + "is not initialized.", + tensor.name())); +} + +static void CheckOutputVarStatus(const paddle::framework::Variable &src_var, + const Tensor &dst_tensor) { + auto name = dst_tensor.name(); + PADDLE_ENFORCE_EQ(dst_tensor.defined(), true, + paddle::platform::errors::InvalidArgument( + "dst_tensor shall be defined.")); + + if (phi::DenseTensor::classof(dst_tensor.impl().get())) { + auto &src_tensor = src_var.Get(); + PADDLE_ENFORCE_EQ(phi::DenseTensor::classof(&src_tensor), true, + paddle::platform::errors::InvalidArgument( + "The output tensor %s get from " + "RunProgram(Grad)Op's internal scope holds " + "wrong type. Expect type is DenseTensor", + name)); + PADDLE_ENFORCE_EQ(src_tensor.initialized(), true, + paddle::platform::errors::InvalidArgument( + "The tensor in output tensor %s get from " + "RunProgram(Grad)Op's internal " + "scope is not initialized.", + name)); + } else if (phi::SelectedRows::classof(dst_tensor.impl().get())) { + auto &src_tensor = src_var.Get(); + PADDLE_ENFORCE_EQ(phi::SelectedRows::classof(&src_tensor), true, + paddle::platform::errors::InvalidArgument( + "The output tensodfr %s get from " + "RunProgram(Grad)Op's internal scope holds " + "wrong type. 
Expect type is SelectedRows", + name)); + PADDLE_ENFORCE_EQ(src_tensor.initialized(), true, + paddle::platform::errors::InvalidArgument( + "The tensor in output tensor %s get from " + "RunProgram(Grad)Op's " + "internal scope is not initialized.", + name)); + + } else { + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "The RunProgram(Grad)Op only support output " + "variable of type LoDTensor or SelectedRows", + name)); + } +} + +static void ShareTensorsIntoScope(const std::vector &tensors, + paddle::framework::Scope *scope) { + for (size_t i = 0; i < tensors.size(); ++i) { + auto name = tensors[i].name(); + if (name == "Fake_var" || !tensors[i].is_initialized()) { + continue; + } + auto *var = scope->Var(name); + CheckInputVarStatus(tensors[i]); + // share tensor + auto tensor_base = tensors[i].impl(); + if (phi::DenseTensor::classof(tensor_base.get())) { + auto *dst_tensor = var->GetMutable(); + auto t = std::dynamic_pointer_cast(tensor_base); + *dst_tensor = *t; + } else if (phi::SelectedRows::classof(tensor_base.get())) { + auto *dst_tensor = var->GetMutable(); + auto t = std::dynamic_pointer_cast(tensor_base); + *dst_tensor = *t; + } + } +} + +static void ShareTensorsFromScope( + const std::vector &tensors, + const paddle::framework::BlockDesc &global_block, + paddle::framework::Scope *scope) { + for (size_t i = 0; i < tensors.size(); ++i) { + // NOTE: In case of setting out_tmp.stop_gradient = True in model code, all + // parameters before generating out_tmp have no @GRAD, it will raise error + // because we can't find them in scope. So we skip sharing these vars or + // var@GRAD if they don't appear in global block. + auto &name = tensors[i]->name(); + if (name == paddle::framework::kEmptyVarName || name == "Fake_var" || + !global_block.HasVar(name)) { + VLOG(2) << "find tensor name is " << name << ", skip it!"; + continue; + } + // NOTE: Here skip not found var is dangerous, if a bug is caused here, + // the result is grad calculation error, which will be very hidden! + auto *var = scope->FindVar(name); + PADDLE_ENFORCE_NOT_NULL(var, paddle::platform::errors::NotFound( + "The output tensor %s is not in " + "RunProgram(Grad)Op'" + "s internal scope.", + name)); + CheckOutputVarStatus(*var, *tensors[i]); + // share tensor + // TODO(dev): Determine Tensor type by scope.var + // auto tensor_base = tensors[i]->impl(); + // if (phi::DenseTensor::classof(tensor_base.get())) { + if (var->IsType()) { + auto &src_tensor = var->Get(); + auto *dst_tensor = const_cast( + dynamic_cast(tensors[i]->impl().get())); + VLOG(2) << "share " << name << " from scope"; + *dst_tensor = src_tensor; + } else if (var->IsType()) { + // } else if (phi::SelectedRows::classof(tensor_base.get())) { + auto &src_tensor = var->Get(); + auto *dst_tensor = const_cast( + dynamic_cast(tensors[i]->impl().get())); + *dst_tensor = src_tensor; + } + } +} + +} // namespace details + +inline void RunProgramAPI( + const std::vector &x, + const std::vector ¶ms, + std::vector &out, // NOLINT + std::vector &step_scope, // NOLINT + std::vector &dout, // NOLINT + const paddle::framework::AttributeMap &attrs) { + VLOG(2) << "RunProgramOpKernel Compute"; + auto start_op_index = BOOST_GET_CONST(int64_t, attrs.at("start_op_index")); + auto end_op_index = BOOST_GET_CONST(int64_t, attrs.at("end_op_index")); + auto is_test = BOOST_GET_CONST(bool, attrs.at("is_test")); + auto program_id = BOOST_GET_CONST(int64_t, attrs.at("program_id")); + + // NOTE(chenweihang): In order not to add new variable type, use vector + // here. 
Originally, here can use scope directly. + auto *out_scope_vec = &step_scope; + PADDLE_ENFORCE_EQ( + out_scope_vec->size(), 1, + paddle::platform::errors::InvalidArgument( + "The OutScope of RunProgramGradOp should only hold one scope.")); + + // Step 2. prepare executor and init persistable variables + + // NOTE(Aurelius84): While training some models, forward can be called many + // times and then apply backpropagation all at once, such as Reinforcement + // Learning. Tensor data in multi-step training should be saved into single + // scope separately. Otherwise, the gradients can be miscalculated because + // always using the Tensor data of the last step in forward. + paddle::framework::Scope *global_inner_scope = out_scope_vec->front(); + VLOG(2) << "The number of sub scopes before forward: " + << out_scope_vec->front()->kids().size(); + paddle::framework::Scope &scope = global_inner_scope->NewScope(); + + // share input_vars & parameters into scope + details::ShareTensorsIntoScope(x, &scope); + details::ShareTensorsIntoScope(params, &scope); + + auto *global_block = + BOOST_GET_CONST(paddle::framework::BlockDesc *, attrs.at("global_block")); + const auto &place = egr::Controller::Instance().GetExpectedPlace(); + + if (end_op_index > start_op_index) { + auto input_names = details::GetTensorsName(x); + auto output_names = details::GetTensorsName(out); + auto dout_names = details::GetTensorsName(dout); + auto *program = global_block->Program(); + + auto cache_info = paddle::framework::GetExecutorInfoFromCache( + *program, place, start_op_index, end_op_index, + /*is_grad=*/false, program_id, &scope); + auto ¶llel_executor = cache_info.first; + // all out_vars are skip_eager_var + auto &skip_eager_delete_vars = + paddle::framework::ExecutorInfoCache::Instance().SkipEagerDeleteVars( + program_id, false); + if (cache_info.second /*is_new_created*/) { + parallel_executor->SkipMemoryReuse(/*scope_idx=*/0, input_names); + skip_eager_delete_vars.insert(skip_eager_delete_vars.end(), + output_names.begin(), output_names.end()); + skip_eager_delete_vars.insert(skip_eager_delete_vars.end(), + dout_names.begin(), dout_names.end()); + paddle::framework::details::ParseSafeEagerDeletionSkipVars( + *program, end_op_index, output_names, &skip_eager_delete_vars); + } + + // Step 3. run ops + parallel_executor->RunWithoutFetch(skip_eager_delete_vars); + } + // Step 4. Get Output + details::ShareTensorsFromScope(out, *global_block, &scope); + details::ShareTensorsFromScope(dout, *global_block, &scope); + + // Debug info: scope info when run end + VLOG(3) << paddle::framework::GenScopeTreeDebugInfo(out_scope_vec->front()); + // Step 5. Drop all children scopes while testing. + if (is_test) { + out_scope_vec->front()->DropKids(); + } + VLOG(2) << "The number of sub scopes after forward: " + << out_scope_vec->front()->kids().size(); + // #ifdef PADDLE_WITH_MKLDNN + // if (FLAGS_use_mkldnn) paddle::platform::DontClearMKLDNNCache(place); + // #endif +} + +inline void RunProgramGradAPI( + const std::vector &x, + const std::vector ¶ms, + const std::vector &out_grad, + const std::vector &step_scope, // NOLINT + const paddle::framework::AttributeMap &attrs, + std::vector &x_grad, // NOLINT + std::vector ¶ms_grad // NOLINT + ) { + // if all output vars are set to stop_gradient, grad op no need to executed + if (x_grad.empty() && params_grad.empty()) return; + + // TODO(dev): Remove this line hard code. And need to deal with the out_grad + // name problem. 
+ // const_cast(out_grad[0]) + // .set_name("matmul_v2_0.tmp_0@GRAD"); + + auto *global_block = + BOOST_GET_CONST(paddle::framework::BlockDesc *, attrs.at("global_block")); + auto orig_end_op_index = BOOST_GET_CONST(int64_t, attrs.at("end_op_index")); + + auto program_id = BOOST_GET_CONST(int64_t, attrs.at("program_id")); + // NOTE: skip `shape` and `fill_constant` op created by + // fluid.backward.gradients, one forward output will generate one `shape` + // and `fill_constant` + int64_t start_op_index = orig_end_op_index + (out_grad.size() * 2); + int64_t end_op_index = global_block->OpSize(); + + auto *out_scope_vec = &step_scope; + PADDLE_ENFORCE_EQ( + out_scope_vec->size(), 1, + paddle::platform::errors::InvalidArgument( + "The OutScope of RunProgramGradOp should only hold one scope.")); + + paddle::framework::Scope *global_inner_scope = out_scope_vec->front(); + auto sub_scope_num = global_inner_scope->kids().size(); + VLOG(2) << "The number of sub scopes before backward: " << sub_scope_num; + PADDLE_ENFORCE_GT(sub_scope_num, 0, + paddle::platform::errors::InvalidArgument( + "The OutScope of RunProgramGradOp should hold at " + "least one sub scope.")); + + auto &scope = *(global_inner_scope->kids().front()); + const auto &place = egr::Controller::Instance().GetExpectedPlace(); + + if (end_op_index > start_op_index) { + auto out_grad_names = details::GetTensorsName(out_grad); + // NOTE: after PR22939 [Add double grad] merged, the grad op maker's + // SetOutput will set to None if the input var stop_gradient=True, + // it will cause an NotFound error when ctx.OutputNames() is called + std::vector x_grad_names; + std::vector param_grad_names; + if (!x_grad.empty()) { + x_grad_names = details::GetTensorsName(x_grad); + } + if (!params_grad.empty()) { + param_grad_names = details::GetTensorsName(params_grad); + } + + // Step 2. prepare executor and scope + auto *program = global_block->Program(); + auto cache_info = paddle::framework::GetExecutorInfoFromCache( + *program, place, start_op_index, end_op_index, + /*is_grad*/ true, program_id, &scope); + auto ¶llel_executor = cache_info.first; + + auto &skip_eager_delete_vars = + paddle::framework::ExecutorInfoCache::Instance().SkipEagerDeleteVars( + program_id, true); + if (cache_info.second /*is_new_created*/) { + parallel_executor->SkipMemoryReuse(/*scope_idx=*/0, out_grad_names); + + skip_eager_delete_vars.insert(skip_eager_delete_vars.end(), + x_grad_names.begin(), x_grad_names.end()); + paddle::framework::details::AppendSkipDeletionVars( + param_grad_names, &skip_eager_delete_vars); + } + + details::ShareTensorsIntoScope(out_grad, &scope); + // Debug info: scope info when run end + VLOG(3) << paddle::framework::GenScopeTreeDebugInfo(out_scope_vec->front()); + + // Step 3. run ops + parallel_executor->RunWithoutFetch( + /*skip_eager_delete_vars=*/skip_eager_delete_vars); + } + + // Step 4. get outputs + details::ShareTensorsFromScope(x_grad, *global_block, &scope); + details::ShareTensorsFromScope(params_grad, *global_block, &scope); + + // Step5. 
drop current scope + // global_inner_scope->DeleteScope(&scope); + VLOG(2) << "The number of sub scopes after backward: " + << global_inner_scope->kids().size(); +} + +class GradNodeRunProgram : public egr::GradNodeBase { + public: + GradNodeRunProgram(size_t bwd_in_slot_num, size_t bwd_out_slot_num) + : egr::GradNodeBase(bwd_in_slot_num, bwd_out_slot_num) {} + + ~GradNodeRunProgram() override = default; + // Functor: perform backward computations + virtual std::vector> operator()( + const std::vector> &grads) + override { + VLOG(3) << "Running Eager Backward Node: GradNodeRunProgram"; + PADDLE_ENFORCE_EQ( + grads.size(), 1, + paddle::platform::errors::InvalidArgument( + "The out_grads.size() of RunProgramGradOp should be equal to 1.")); + + VLOG(3) << "out_grads[0].size() : " << grads[0].size(); + std::vector x_grad; + std::vector params_grad; + ConstructGradTensors(x_, &x_grad); + ConstructGradTensors(params_, ¶ms_grad); + std::vector x_grad_ptr; + std::vector params_grad_ptr; + for (auto &i : x_grad) { + x_grad_ptr.emplace_back(&i); + } + for (auto &i : params_grad) { + params_grad_ptr.emplace_back(&i); + } + + // auto x_grad_ptr = ConstructGradTensors(x_); + // auto params_grad_ptr = ConstructGradTensors(params_); + + PADDLE_ENFORCE_EQ( + grads[0].size(), fwd_out_names_.size(), + paddle::platform::errors::InvalidArgument( + "The grads[0].size() and fwd_out_names_.size() should be equal.")); + for (size_t i = 0; i < fwd_out_names_.size(); ++i) { + const_cast(grads[0][i]) + .set_name(fwd_out_names_[i] + "@GRAD"); + } + + RunProgramGradAPI(x_, params_, grads[0], step_scope_, attrs_, x_grad_ptr, + params_grad_ptr); + VLOG(3) << "End Eager Backward Node: GradNodeRunProgram"; + return {x_grad, params_grad}; + // return {x_grad, details::DereferenceTensors(params_grad_ptr)}; + } + + // SetAttrMap + void SetAttrMap(const paddle::framework::AttributeMap &attrs) { + attrs_ = attrs; + } + + void SetFwdX(const std::vector &tensors) { + x_ = tensors; + } + + void SetFwdParams(const std::vector &tensors) { + params_ = tensors; + } + + void SetStepScope(const std::vector &scopes) { + step_scope_ = scopes; + } + + void SetFwdOutNames(std::vector out_names) { + fwd_out_names_ = out_names; + } + + protected: + void ConstructGradTensors( + const std::vector &fwd_tensors, + std::vector *grad_tensors) { + // TODO(dev): Need an elegant way to determine inforamtion of grad_tensor, + // such as: name, tensor type(DenseTensor or SelectedRows). 
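The ConstructGradTensors helpers here rely on a simple naming convention: each grad tensor takes its forward tensor's name with an "@GRAD" suffix, which is how RunProgramGradAPI later locates the matching variables in the executor's inner scope. A short worked illustration of the conventions used in this file (the forward name and the count of three output grads are illustrative only):

    // Naming convention applied in GradNodeRunProgram::operator() and
    // ConstructGradTensors:
    //   forward output : "matmul_v2_0.tmp_0"
    //   its gradient   : "matmul_v2_0.tmp_0@GRAD"
    //
    // Grad-section offset computed in RunProgramGradAPI: each forward output
    // contributes one `shape` and one `fill_constant` op, so with 3 output grads
    //   start_op_index = orig_end_op_index + 3 * 2 = orig_end_op_index + 6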
+ VLOG(3) << "fwd_tensors.size(): " << fwd_tensors.size(); + for (auto &fwd_t : fwd_tensors) { + grad_tensors->emplace_back(fwd_t.impl()); + auto &grad_t = grad_tensors->back(); + grad_t.set_name(fwd_t.name() + "@GRAD"); + } + } + + void ConstructGradTensors( + const std::vector &fwd_tensors) { + VLOG(3) << "fwd_tensors.size(): " << fwd_tensors.size(); + for (auto &fwd_t : fwd_tensors) { + auto grad_tesnor = egr::EagerUtils::unsafe_autograd_meta(fwd_t)->Grad(); + grad_tesnor.set_name(fwd_t.name() + "@GRAD"); + } + } + + private: + // TensorWrappers + std::vector x_; + std::vector params_; + std::vector step_scope_; + + std::vector fwd_out_names_; + + // Attribute Map + paddle::framework::AttributeMap attrs_; +}; diff --git a/paddle/fluid/eager/utils.cc b/paddle/fluid/eager/utils.cc index a7e5931f1f9bc66006fb1a37836be1eda371953e..8a57d2694535e9c27e88416468fe5a67ce020b43 100644 --- a/paddle/fluid/eager/utils.cc +++ b/paddle/fluid/eager/utils.cc @@ -122,12 +122,22 @@ paddle::experimental::Tensor* EagerUtils::mutable_grad( void EagerUtils::SetHistory(std::vector* autograd_metas, const std::shared_ptr& grad_node) { for (const auto& autograd_meta : *autograd_metas) { + if (autograd_meta->GradNode()) { + VLOG(7) << "Should not set grad node twice, original node is:" + << autograd_meta->GradNode()->name() + << "current is: " << grad_node->name(); + } autograd_meta->SetGradNode(grad_node); } } void EagerUtils::SetHistory(AutogradMeta* autograd_meta, const std::shared_ptr& grad_node) { + if (autograd_meta->GradNode()) { + VLOG(7) << "Should not set grad node twice, original node is:" + << autograd_meta->GradNode()->name() + << "current is: " << grad_node->name(); + } autograd_meta->SetGradNode(grad_node); } diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt old mode 100644 new mode 100755 index 14aecb5fd43c49ece1f79cb9c8e2b70e9d07df07..aa92a3b2226c1fca1fa7326e76ef29b0b38cd8d6 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -235,6 +235,7 @@ if(WITH_PYTHON) py_proto_compile(trainer_py_proto SRCS trainer_desc.proto data_feed.proto) py_proto_compile(distributed_strategy_py_proto SRCS distributed_strategy.proto) py_proto_compile(pass_desc_py_proto SRCS pass_desc.proto) + py_proto_compile(ps_py_proto SRCS ps.proto) #Generate an empty \ #__init__.py to make framework_py_proto as a valid python module. add_custom_target(fleet_proto_init ALL @@ -242,12 +243,13 @@ if(WITH_PYTHON) COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto/__init__.py ) add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py) - add_dependencies(framework_py_proto framework_py_proto_init trainer_py_proto distributed_strategy_py_proto fleet_proto_init pass_desc_py_proto) + add_dependencies(framework_py_proto framework_py_proto_init trainer_py_proto distributed_strategy_py_proto fleet_proto_init pass_desc_py_proto ps_py_proto) if (NOT WIN32) add_custom_command(TARGET framework_py_proto POST_BUILD COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/ COMMAND cp distributed_strategy_*.py ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto + COMMAND cp ps_pb2.py ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto COMMENT "Copy generated python proto into directory paddle/fluid/proto." 
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) add_custom_target(fleet_executor_proto_init ALL DEPENDS fleet_proto_init fleet_executor_desc_py_proto @@ -259,6 +261,7 @@ if(WITH_PYTHON) add_custom_command(TARGET framework_py_proto POST_BUILD COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto COMMAND copy /Y *.py ${proto_dstpath} + COMMAND copy /Y ps_pb2.py ${fleet_proto_dstpath} COMMAND copy /Y distributed_strategy_*.py ${fleet_proto_dstpath} COMMENT "Copy generated python proto into directory paddle/fluid/proto." COMMENT "Copy generated python proto into directory paddle/distributed/fleet/proto." @@ -437,11 +440,10 @@ message(STATUS "branch: ${PADDLE_BRANCH}") configure_file(commit.h.in commit.h) cc_library(custom_operator SRCS custom_operator.cc DEPS tensor attribute framework_proto op_registry operator dynamic_loader string_helper phi_tensor op_meta_info phi_api) -cc_library(custom_kernel SRCS custom_kernel.cc DEPS op_registry phi_custom_kernel phi_tensor_raw) #cc_binary(test_executor SRCS test_executor.cc DEPS executor op_registry ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} ) #cc_binary(new_executor SRCS new_exec_test.cc DEPS operator op_registry executor ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} profiler) -set(FLUID_FRAMEWORK_MODULES proto_desc memory lod_tensor executor data_feed_proto layer dynamic_loader custom_operator custom_kernel) +set(FLUID_FRAMEWORK_MODULES proto_desc memory lod_tensor executor data_feed_proto layer dynamic_loader custom_operator) cc_library(paddle_framework DEPS ${FLUID_FRAMEWORK_MODULES}) diff --git a/paddle/fluid/framework/custom_kernel.cc b/paddle/fluid/framework/custom_kernel.cc deleted file mode 100644 index 49a1e0774a6b1a7a1afd154029850ceb52040759..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/custom_kernel.cc +++ /dev/null @@ -1,47 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#if defined _WIN32 || defined __APPLE__ -#else -#define _LINUX -#endif - -#include "paddle/fluid/framework/custom_kernel.h" -#include "paddle/phi/core/custom_kernel.h" - -namespace paddle { -namespace framework { - -void LoadCustomKernelLib(const std::string& dso_lib_path, void* dso_handle) { -#ifdef _LINUX - typedef phi::CustomKernelMap& get_custom_kernel_map_t(); - auto* func = reinterpret_cast( - dlsym(dso_handle, "PD_GetCustomKernelMap")); - - if (func == nullptr) { - LOG(WARNING) << "Skipped lib [" << dso_lib_path << "]: fail to find " - << "PD_GetCustomKernelMap symbol in this lib."; - return; - } - auto& custom_kernel_map = func(); - phi::RegisterCustomKernels(custom_kernel_map); - LOG(INFO) << "Successed in loading custom kernels in lib: " << dso_lib_path; -#else - VLOG(3) << "Unsupported: Custom kernel is only implemented on Linux."; -#endif - return; -} - -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 66dfb81755f1c9cc16ab8a52df429af8d94ab718..948eaab40b4f64f2a87a83fab80d4eade5288e91 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -139,7 +139,7 @@ set(IR_PASS_DEPS graph_viz_pass multi_devices_graph_pass coalesce_grad_tensor_pass fuse_all_reduce_op_pass backward_optimizer_op_deps_pass fuse_adam_op_pass fuse_sgd_op_pass fuse_momentum_op_pass sync_batch_norm_pass runtime_context_cache_pass graph_to_program_pass - fix_op_run_order_pass) + fix_op_run_order_pass fuse_gemm_epilogue_pass) if (WITH_CINN) set(IR_PASS_DEPS ${IR_PASS_DEPS} build_cinn_pass) diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index c99200ec98aa8f0736610f659d3b94e3c2f1e023..fdf74d2f769fcdd49da19c0118a23d6b8fbb06e4 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -1,4 +1,5 @@ /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +Copyright (c) 2022 NVIDIA Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -175,6 +176,11 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { !defined(_WIN32) && !defined(__APPLE__) AppendPassWithCheck(strategy_.enable_auto_fusion_, "fusion_group_pass"); #endif + +#if (defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060) + AppendPassWithCheck(strategy_.fuse_gemm_epilogue_, + "fuse_gemm_epilogue_pass"); +#endif AppendPassWithCheck(strategy_.fuse_elewise_add_act_ops_, "fuse_elewise_add_act_pass"); // for single card training, fuse_all_reduce_ops is unnecessary. @@ -507,3 +513,6 @@ USE_PASS(mkldnn_placement_pass); !defined(_WIN32) && !defined(__APPLE__) USE_PASS(fusion_group_pass); #endif +#if (defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060) +USE_PASS(fuse_gemm_epilogue_pass); +#endif diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index 70a083dd70bc3b48bf24b050673f3da7b69b1755..5eb584aaefa981ab6c6f25df7a765ae9a3d0194a 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -1,4 +1,5 @@ // Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 NVIDIA Authors. All Rights Reserved. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -124,6 +125,8 @@ struct BuildStrategy { paddle::optional fuse_broadcast_ops_{paddle::none}; // replace batch_norm with sync_batch_norm. bool sync_batch_norm_{false}; + // Fuse GEMM+Epilogue via cublasLt epilogue. + bool fuse_gemm_epilogue_{false}; // mkldnn_enabled_op_types specify the operator type list to // use MKLDNN acceleration. It is null in default, means diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h b/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h index b7cb2ce0f0102bd34940864960118f396c5dcad7..59220fc9cdaf1f05f70e8cfe961071c1fad3a760 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h @@ -186,45 +186,63 @@ void HashTable::insert(const KeyType* d_keys, size_t len, template void HashTable::dump_to_cpu(int devid, cudaStream_t stream) { container_->prefetch(cudaCpuDeviceId, stream); + std::vector threads; size_t num = container_->size(); KeyType unuse_key = std::numeric_limits::max(); thrust::pair* kv = container_->data(); - for (size_t i = 0; i < num; ++i) { - if (kv[i].first == unuse_key) { - continue; - } - ValType& gpu_val = kv[i].second; + + int thread_num = 8; + int len_per_thread = num / thread_num; + int remain = num % thread_num; + int begin = 0; + + auto dump_func = [unuse_key, kv](int left, int right) { + for (int i = left; i < right; i++) { + if (kv[i].first == unuse_key) { + continue; + } + ValType& gpu_val = kv[i].second; #ifdef PADDLE_WITH_PSLIB - auto* downpour_value = - (paddle::ps::DownpourFixedFeatureValue*)(gpu_val.cpu_ptr); - int downpour_value_size = downpour_value->size(); - if (gpu_val.mf_size > 0 && downpour_value_size == 7) { - downpour_value->resize(gpu_val.mf_size + downpour_value_size); - } - float* cpu_val = downpour_value->data(); - // cpu_val[0] = 0; - cpu_val[1] = gpu_val.delta_score; - cpu_val[2] = gpu_val.show; - cpu_val[3] = gpu_val.clk; - cpu_val[4] = gpu_val.lr; - cpu_val[5] = gpu_val.lr_g2sum; - cpu_val[6] = gpu_val.slot; - if (gpu_val.mf_size > 0) { - for (int x = 0; x < gpu_val.mf_size; x++) { - cpu_val[x + 7] = gpu_val.mf[x]; + auto* downpour_value = + (paddle::ps::DownpourFixedFeatureValue*)(gpu_val.cpu_ptr); + int downpour_value_size = downpour_value->size(); + if (gpu_val.mf_size > 0 && downpour_value_size == 7) { + downpour_value->resize(gpu_val.mf_size + downpour_value_size); + } + float* cpu_val = downpour_value->data(); + // cpu_val[0] = 0; + cpu_val[1] = gpu_val.delta_score; + cpu_val[2] = gpu_val.show; + cpu_val[3] = gpu_val.clk; + cpu_val[4] = gpu_val.lr; + cpu_val[5] = gpu_val.lr_g2sum; + cpu_val[6] = gpu_val.slot; + if (gpu_val.mf_size > 0) { + for (int x = 0; x < gpu_val.mf_size; x++) { + cpu_val[x + 7] = gpu_val.mf[x]; + } } - } #endif #ifdef PADDLE_WITH_PSCORE - auto* downpour_value = (paddle::distributed::VALUE*)(gpu_val.cpu_ptr); - downpour_value->count_ = gpu_val.show; - for (int x = 0; x < gpu_val.mf_size; x++) { - downpour_value->data_[x] = gpu_val.mf[x]; - } + auto* downpour_value = (paddle::distributed::VALUE*)(gpu_val.cpu_ptr); + downpour_value->count_ = gpu_val.show; + for (int x = 0; x < gpu_val.mf_size; x++) { + downpour_value->data_[x] = gpu_val.mf[x]; + } #endif + } + }; + + for (int i = 0; i < thread_num; i++) { + threads.push_back(std::thread( + dump_func, begin, begin + len_per_thread + (i < remain ? 1 : 0))); + begin += len_per_thread + (i < remain ? 
1 : 0); + } + for (std::thread& t : threads) { + t.join(); } - container_->prefetch(devid, stream); + // container_->prefetch(devid, stream); } template diff --git a/paddle/fluid/framework/garbage_collector.cc b/paddle/fluid/framework/garbage_collector.cc index 9f2bdeffecf62764f5cbe5bea9cb50d4830be43b..c1f8041cc1eca34b858608ffb77598ce095d0b4f 100644 --- a/paddle/fluid/framework/garbage_collector.cc +++ b/paddle/fluid/framework/garbage_collector.cc @@ -231,19 +231,19 @@ void CustomDeviceUnsafeFastGarbageCollector::ClearCallback( CustomStreamGarbageCollector::CustomStreamGarbageCollector( const platform::CustomPlace &place, size_t max_memory_size) : GarbageCollector(place, max_memory_size) { - platform::DeviceGuard guard(place); - stream_.reset(new platform::stream::Stream); + phi::DeviceGuard guard(place); + stream_.reset(new phi::stream::Stream); stream_->Init(place); - callback_manager_.reset(new platform::CallbackManager(stream_.get())); + callback_manager_.reset(new phi::CallbackManager(stream_.get())); } CustomStreamGarbageCollector::~CustomStreamGarbageCollector() { - platform::DeviceGuard guard(this->dev_ctx_->GetPlace()); + phi::DeviceGuard guard(this->dev_ctx_->GetPlace()); stream_->Synchronize(); stream_->Destroy(); } -platform::stream::Stream *CustomStreamGarbageCollector::stream() const { +phi::stream::Stream *CustomStreamGarbageCollector::stream() const { return stream_.get(); } diff --git a/paddle/fluid/framework/garbage_collector.h b/paddle/fluid/framework/garbage_collector.h index a67860c6087e0f173e09d2a7c131703260c562fd..f0027c676050b8c31c0bc0ca4ab3b6444f29e1a2 100644 --- a/paddle/fluid/framework/garbage_collector.h +++ b/paddle/fluid/framework/garbage_collector.h @@ -230,14 +230,14 @@ class CustomStreamGarbageCollector : public GarbageCollector { void Wait() const override; - platform::stream::Stream *stream() const; + phi::stream::Stream *stream() const; protected: void ClearCallback(const std::function &callback) override; private: - std::unique_ptr stream_; - std::unique_ptr callback_manager_; + std::unique_ptr stream_; + std::unique_ptr callback_manager_; }; #endif diff --git a/paddle/fluid/framework/infershape_utils.cc b/paddle/fluid/framework/infershape_utils.cc index e14b91d935d05c12442f3d0205c1e97df9697ec3..29c7f5d0ce73cbf1af18e6f5869d59d2200917ad 100644 --- a/paddle/fluid/framework/infershape_utils.cc +++ b/paddle/fluid/framework/infershape_utils.cc @@ -88,6 +88,10 @@ class InferShapeArgumentMappingContext : public phi::ArgumentMappingContext { return var_types[0] == proto::VarType::SELECTED_ROWS; } + bool IsForInferShape() const override { return true; } + + bool IsRuntime() const override { return ctx_.IsRuntime(); } + private: const InferShapeContext& ctx_; }; @@ -127,7 +131,9 @@ class CompatMetaTensor : public phi::MetaTensor { } } else { auto* var = BOOST_GET_CONST(VarDesc*, var_); - return phi::make_ddim(var->GetShape()); + + return var->GetShape().empty() ? 
phi::make_ddim({0UL}) + : phi::make_ddim(var->GetShape()); } } @@ -228,16 +234,8 @@ class CompatMetaTensor : public phi::MetaTensor { } } - void share_meta(const MetaTensor& meta_tensor) override { + void share_dims(const MetaTensor& meta_tensor) override { set_dims(meta_tensor.dims()); - set_dtype(meta_tensor.dtype()); - // VarDesc doesn't contains layout, so we cannot share layout - // set_layout(meta_tensor.layout()); - - // special case 1: share lod of LoDTensor - share_lod(meta_tensor); - - // special case 2: share height and rows of SelectedRows in runtime if (is_runtime_) { auto* var = BOOST_GET(Variable*, var_); if (var->IsType()) { @@ -250,6 +248,16 @@ class CompatMetaTensor : public phi::MetaTensor { } } + void share_meta(const MetaTensor& meta_tensor) override { + set_dtype(meta_tensor.dtype()); + // VarDesc doesn't contains layout, so we cannot share layout + // set_layout(meta_tensor.layout()); + + // special case 1: share lod of LoDTensor + share_lod(meta_tensor); + share_dims(meta_tensor); + } + private: const LoD& GetRuntimeLoD() const { auto* var = BOOST_GET_CONST(Variable*, var_); @@ -308,22 +316,25 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, // TODO(chenweihang): support multiple inputs and outputs later phi::InferMetaContext infer_mete_context; for (auto& in_name : input_names) { - if (ctx->HasInput(in_name)) { - infer_meta_context.EmplaceBackInput(std::make_shared( - ctx->GetInputVarPtrs(in_name)[0], ctx->IsRuntime())); + if (ctx->HasInputs(in_name)) { + auto input_var = ctx->GetInputVarPtrs(in_name); + if (input_var.size() == 1) { + infer_meta_context.EmplaceBackInput( + std::make_shared(input_var[0], ctx->IsRuntime())); + } else { + paddle::SmallVector> inputs; + inputs.reserve(input_var.size()); + for (const auto& in : input_var) { + inputs.push_back( + std::make_shared(in, ctx->IsRuntime())); + } + infer_meta_context.EmplaceBackInputs(std::move(inputs)); + } } else { infer_meta_context.EmplaceBackInput({nullptr}); } } - for (auto& out_name : output_names) { - if (ctx->HasOutput(out_name)) { - infer_meta_context.EmplaceBackOutput(std::make_shared( - ctx->GetOutputVarPtrs(out_name)[0], ctx->IsRuntime())); - } else { - infer_meta_context.EmplaceBackOutput({nullptr}); - } - } auto attr_reader = ctx->Attrs(); for (size_t i = 0; i < attr_names.size(); ++i) { auto attr_name = attr_names[i]; @@ -348,13 +359,13 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, } } else { // If is not in runtime, we will set default value(-1) for ScalarArray - int64_t num_ele = 0; std::vector vars; vars.reserve(infershape_inputs.size()); - for (size_t i = 0; i < infershape_inputs.size(); i++) { + for (size_t i = 0; i < infershape_inputs.size(); ++i) { vars.push_back(BOOST_GET_CONST(VarDesc*, infershape_inputs[i])); } + int64_t num_ele = 0; if (vars.size() == 1) { num_ele = 1; const auto& tensor_dims = vars[0]->GetShape(); @@ -362,16 +373,7 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, num_ele *= tensor_dims[i]; } } else { - for (auto& var : vars) { - const auto& tensor_dims = var->GetShape(); - PADDLE_ENFORCE_EQ(tensor_dims.size(), 1, - platform::errors::InvalidArgument( - "The shape is constructed by multi-tensor, " - "every tensor's dims should be 1. 
But your " - "shape has tensor that dims is %s.", - tensor_dims.size())); - num_ele += tensor_dims[0]; - } + num_ele = vars.size(); } phi::ScalarArray tensor_attr(std::vector(num_ele, -1)); tensor_attr.SetFromTensor(true); @@ -383,10 +385,18 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, std::type_index(typeid(std::vector))) { infer_meta_context.EmplaceBackAttr(std::move( phi::ScalarArray(BOOST_GET_CONST(std::vector, attr)))); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + infer_meta_context.EmplaceBackAttr(std::move( + phi::ScalarArray(BOOST_GET_CONST(std::vector, attr)))); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(int))) { + infer_meta_context.EmplaceBackAttr( + phi::ScalarArray({BOOST_GET_CONST(int, attr)})); } else { PADDLE_THROW(platform::errors::Unimplemented( "Unsupported cast op attribute `%s` to ScalarArray when " - "construct KernelContext.", + "construct InferMetaContext.", attr_name)); } } @@ -414,7 +424,6 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, } } else if (ctx->HasInput(attr_name)) { const auto& infershape_input = ctx->GetInputVarPtrs(attr_name); - if (infershape_input.size() == 1) { if (ctx->IsRuntime()) { Variable* var = BOOST_GET_CONST(Variable*, infershape_input[0]); @@ -490,6 +499,28 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, "Unsupported attribute type is received when call " "InferShapeFunctor.")); } + } else { + // do nothing + } + } + + for (auto& out_name : output_names) { + if (ctx->HasOutputs(out_name)) { + auto output_var = ctx->GetOutputVarPtrs(out_name); + if (output_var.size() == 1) { + infer_meta_context.EmplaceBackOutput(std::make_shared( + output_var[0], ctx->IsRuntime())); + } else { + paddle::SmallVector> outputs; + outputs.reserve(output_var.size()); + for (const auto& out : output_var) { + outputs.emplace_back( + std::make_shared(out, ctx->IsRuntime())); + } + infer_meta_context.EmplaceBackOutputs(std::move(outputs)); + } + } else { + infer_meta_context.EmplaceBackOutput({nullptr}); } } diff --git a/paddle/fluid/framework/infershape_utils.h b/paddle/fluid/framework/infershape_utils.h index 64c8371d583ffef621e5009504d14308dd7b997c..b692b6ffab08014f7de6ef4e5488445204396853 100644 --- a/paddle/fluid/framework/infershape_utils.h +++ b/paddle/fluid/framework/infershape_utils.h @@ -29,7 +29,7 @@ namespace framework { phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, const std::string& op_type); -#define DELCARE_INFER_SHAPE_FUNCTOR(op_type, functor_name, fn) \ +#define DECLARE_INFER_SHAPE_FUNCTOR(op_type, functor_name, fn) \ struct functor_name : public paddle::framework::InferShapeBase { \ void operator()( \ paddle::framework::InferShapeContext* ctx) const override { \ diff --git a/paddle/fluid/framework/infershape_utils_test.cc b/paddle/fluid/framework/infershape_utils_test.cc index 53dcc19fcbae88ab5ccfcc498037327946029927..2eeefb19a1aa8c5c9e4f92ff06618c719bb30785 100644 --- a/paddle/fluid/framework/infershape_utils_test.cc +++ b/paddle/fluid/framework/infershape_utils_test.cc @@ -110,9 +110,9 @@ void InferShapeUtilsTestKernel( } // namespace framework } // namespace paddle -DELCARE_INFER_SHAPE_FUNCTOR(infer_shape_utils_test, +DECLARE_INFER_SHAPE_FUNCTOR(infer_shape_utils_test, InferShapeUtilsTestInferShapeFunctor, - PT_INFER_META(paddle::framework::TestInferMeta)); + PD_INFER_META(paddle::framework::TestInferMeta)); REGISTER_OPERATOR(infer_shape_utils_test, 
paddle::framework::InferShapeUtilsTestOp, paddle::framework::InferShapeUtilsTestOpMaker, diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index dad5358590cb1497453681ce940898314a1d06eb..a1f2d6edca6a2db5d5bb4c8cf896c492f20ed2da 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -78,7 +78,6 @@ pass_library(is_test_pass base) pass_library(conv_elementwise_add_act_fuse_pass inference) pass_library(conv_elementwise_add2_act_fuse_pass inference) pass_library(conv_elementwise_add_fuse_pass inference) -pass_library(conv_affine_channel_fuse_pass inference) pass_library(transpose_flatten_concat_fuse_pass inference) pass_library(identity_scale_op_clean_pass base) pass_library(sync_batch_norm_pass base) @@ -158,6 +157,7 @@ endif() cc_library(fuse_bn_act_pass SRCS fuse_bn_act_pass.cc DEPS pass graph_pattern_detector ) cc_library(fuse_bn_add_act_pass SRCS fuse_bn_add_act_pass.cc DEPS pass graph_pattern_detector ) cc_library(fuse_elewise_add_act_pass SRCS fuse_elewise_add_act_pass.cc DEPS pass graph_pattern_detector ) +cc_library(fuse_gemm_epilogue_pass SRCS fuse_gemm_epilogue_pass.cc DEPS pass graph_pattern_detector ) cc_library(fuse_relu_depthwise_conv_pass SRCS fuse_relu_depthwise_conv_pass.cc DEPS pass graph_pattern_detector ) set(GLOB_PASS_LIB ${PASS_LIBRARY} CACHE INTERNAL "Global PASS library") diff --git a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc deleted file mode 100644 index f28c9988bd858ad00a5c5a532b7b484315557d8f..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc +++ /dev/null @@ -1,420 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
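Stepping back to the infershape_utils changes earlier in this patch: the functor macro is renamed from DELCARE_INFER_SHAPE_FUNCTOR to DECLARE_INFER_SHAPE_FUNCTOR and the wrapper from PT_INFER_META to PD_INFER_META. A minimal usage sketch of the renamed pair, modeled on the updated infershape_utils_test.cc; the names my_op, MyOp, MyOpMaker, MyOpInferShapeFunctor and MyInferMeta are hypothetical placeholders, not part of this patch.

// Hypothetical operator wiring its shape inference to a phi InferMeta function
// through the renamed macros (mirrors infershape_utils_test.cc above).
DECLARE_INFER_SHAPE_FUNCTOR(my_op, MyOpInferShapeFunctor,
                            PD_INFER_META(MyInferMeta));

REGISTER_OPERATOR(my_op, MyOp, MyOpMaker, MyOpInferShapeFunctor);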
- -#include "paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h" - -#include - -#include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/framework/op_version_registry.h" - -namespace phi { -class DenseTensor; -} // namespace phi - -namespace paddle { -namespace framework { -class Scope; -} // namespace framework -} // namespace paddle - -namespace paddle { -namespace framework { -namespace ir { - -class Node; - -#define GET_CONV_BN_NODES(pattern_name) \ - /* OPERATORS */ \ - GET_IR_NODE_FROM_SUBGRAPH(conv, conv, pattern_name); \ - GET_IR_NODE_FROM_SUBGRAPH(affine_channel, affine_channel, pattern_name); \ - /* CONV inputs */ \ - GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight, pattern_name); \ - /* CONV outputs */ \ - GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, pattern_name); \ - /* Affine Channel inputs */ \ - GET_IR_NODE_FROM_SUBGRAPH(ac_scale, ac_scale, pattern_name); \ - GET_IR_NODE_FROM_SUBGRAPH(ac_bias, ac_bias, pattern_name); \ - /* Affine channel outputs */ \ - GET_IR_NODE_FROM_SUBGRAPH(ac_out, ac_out, pattern_name); /* Out */ - -void recompute_bias_and_weights(const Scope* scope, ir::Node* conv_weight, - const ir::Node& ac_scale, - const LoDTensor& ac_bias_tensor, - LoDTensor* eltwise_y_in_tensor) { - using EigenVectorArrayMap = - Eigen::Map>; - using ConstEigenVectorArrayMap = - Eigen::Map>; - using EigenMatrixArrayMap = Eigen::Map< - Eigen::Array>; - - // Re-compute bias of conv2d from AffineChannel - PADDLE_ENFORCE_EQ( - eltwise_y_in_tensor->dims(), ac_bias_tensor.dims(), - platform::errors::InvalidArgument( - "Tensor elementwise y(%d) and activation bias(%d) must have same " - "dimension.", - eltwise_y_in_tensor->dims().size(), ac_bias_tensor.dims().size())); - - auto* scale_tensor = scope->FindVar(ac_scale.Name())->GetMutable(); - - ConstEigenVectorArrayMap scale_array(scale_tensor->data(), - scale_tensor->numel(), 1); - ConstEigenVectorArrayMap ac_bias_array(ac_bias_tensor.data(), - ac_bias_tensor.numel(), 1); - - EigenVectorArrayMap eltwise_y_in_array( - eltwise_y_in_tensor->mutable_data(platform::CPUPlace()), - eltwise_y_in_tensor->numel(), 1); - - eltwise_y_in_array = (eltwise_y_in_array * scale_array) + ac_bias_array; - - // Re-compute weight of conv2d from AffineChannel - auto* weights = scope->FindVar(conv_weight->Name())->GetMutable(); - auto weights_shape = weights->dims(); - auto weights_shape_2d = phi::flatten_to_2d(weights_shape, 1); - auto* weights_data = weights->mutable_data(platform::CPUPlace()); - - EigenMatrixArrayMap weights_array_2d(weights_data, weights_shape_2d[0], - weights_shape_2d[1]); - - weights_array_2d.colwise() *= scale_array; - - // Check for subnormal values that slows down convolution execution - for (int i = 0; i < weights->numel(); ++i) { - if (std::fpclassify(weights_data[i]) == FP_SUBNORMAL) weights_data[i] = 0; - } -} - -ConvAffineChannelFusePass::ConvAffineChannelFusePass() { - AddOpCompat(OpCompat("conv2d")) - .AddInput("Input") - .IsTensor() - .End() - .AddInput("Filter") - .IsTensor() - .End() - .AddInput("Bias") - .IsTensor() - .IsOptional() - .End() - .AddInput("ResidualData") - .IsTensor() - .IsOptional() - .End() - .AddOutput("Output") - .IsTensor() - .End() - .AddAttr("strides") - .IsType>() - .End() - .AddAttr("paddings") - .IsType>() - .End() - .AddAttr("padding_algorithm") - .IsOptional() - .IsStringIn({"EXPLICIT", "SAME", "VALID"}) - .End() - .AddAttr("groups") - .IsNumGE(1) - .End() - .AddAttr("dilations") - .IsType>() - .End() - .AddAttr("data_format") - .IsStringIn({"NCHW", "AnyLayout"}) - 
.End(); - - AddOpCompat(OpCompat("affine_channel")) - .AddInput("X") - .IsTensor() - .End() - .AddInput("Scale") - .IsTensor() - .End() - .AddInput("Bias") - .IsTensor() - .IsOptional() - .End() - .AddOutput("Out") - .IsTensor() - .End() - .AddAttr("data_layout") - .IsStringIn({"NCHW", "AnyLayout"}) - .End(); - - AddOpCompat(OpCompat("elementwise_add")) - .AddInput("X") - .IsTensor() - .End() - .AddInput("Y") - .IsTensor() - .End() - .AddOutput("Out") - .IsTensor() - .End() - .AddAttr("axis") - .IsNumEQ(1) - .End(); -} - -void ConvAffineChannelFusePass::ApplyImpl(ir::Graph* graph) const { - PADDLE_ENFORCE_NOT_NULL( - graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); - FusePassBase::Init(name_scope_, graph); - - auto* scope = param_scope(); - PADDLE_ENFORCE_NOT_NULL( - scope, platform::errors::InvalidArgument("Scope cannot be nullptr.")); - - GraphPatternDetector gpd; - auto* conv_input = - gpd.mutable_pattern() - ->NewNode(patterns::PDNodeName(name_scope_, "conv_input")) - ->AsInput() - ->assert_is_op_input("conv2d", "Input"); - patterns::ConvAffineChannel conv_ac_pattern(gpd.mutable_pattern(), - name_scope_); - conv_ac_pattern(conv_input, false /*with_eltwise_add*/); - - int found_conv_ac_count = 0; - auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, - Graph* g) { - if (!IsCompat(subgraph, g)) { - LOG(WARNING) << "ConvAffineChannelFusePass in op compat failed."; - return; - } - - VLOG(4) << "handle ConvAffineChannel fuse"; - - GET_CONV_BN_NODES(conv_ac_pattern); - - auto data_format = conv->Op()->GetAttrIfExists("data_format"); - if (data_format == "AnyLayout") { - LOG_FIRST_N(WARNING, 1) << "conv_affine_channel_fuse_pass is enabled, " - "it's wrong if data_format of conv is not " - "NCHW."; - } - - // Get affine_channel bias for resizing eltwise_y! - auto* ac_bias_tensor = - scope->FindVar(ac_bias->Name())->GetMutable(); - - // Create eltwise_y (conv bias) variable - VarDesc eltwise_y_in_desc( - patterns::PDNodeName(name_scope_, "eltwise_y_in")); - // Set shape && datatype manually - eltwise_y_in_desc.SetShape(phi::vectorize(ac_bias_tensor->dims())); - eltwise_y_in_desc.SetDataType( - framework::TransToProtoVarType(ac_bias_tensor->dtype())); - eltwise_y_in_desc.SetLoDLevel(ac_bias->Var()->GetLoDLevel()); - eltwise_y_in_desc.SetPersistable(true); - - // Initialize eltwise_y - auto* eltwise_y_in_node = g->CreateVarNode(&eltwise_y_in_desc); - auto* eltwise_y_in_tensor = - scope->Var(eltwise_y_in_node->Name())->GetMutable(); - eltwise_y_in_tensor->Resize(ac_bias_tensor->dims()); - std::fill_n(eltwise_y_in_tensor->mutable_data(platform::CPUPlace()), - eltwise_y_in_tensor->numel(), 0.0f); - - // update weights and biases - recompute_bias_and_weights(scope, conv_weight, *ac_scale, *ac_bias_tensor, - eltwise_y_in_tensor); - - // create an elementwise add node. - OpDesc desc; - desc.SetInput("X", std::vector({conv_out->Name()})); - desc.SetInput("Y", std::vector({eltwise_y_in_node->Name()})); - desc.SetOutput("Out", std::vector({ac_out->Name()})); - desc.SetType("elementwise_add"); - desc.SetAttr("axis", 1); - desc.SetAttr("use_mkldnn", conv->Op()->GetAttrIfExists("use_mkldnn")); - - auto eltwise_op = g->CreateOpNode(&desc); // OpDesc will be copied. 
- - GraphSafeRemoveNodes(graph, {ac_scale, ac_bias, affine_channel}); - - IR_NODE_LINK_TO(conv_out, eltwise_op); - IR_NODE_LINK_TO(eltwise_y_in_node, eltwise_op); - IR_NODE_LINK_TO(eltwise_op, ac_out); - found_conv_ac_count++; - }; - - gpd(graph, handler); - - AddStatis(found_conv_ac_count); -} - -ConvEltwiseAddAffineChannelFusePass::ConvEltwiseAddAffineChannelFusePass() { - AddOpCompat(OpCompat("conv2d")) - .AddInput("Input") - .IsTensor() - .End() - .AddInput("Filter") - .IsTensor() - .End() - .AddInput("Bias") - .IsTensor() - .IsOptional() - .End() - .AddInput("ResidualData") - .IsTensor() - .IsOptional() - .End() - .AddOutput("Output") - .IsTensor() - .End() - .AddAttr("strides") - .IsType>() - .End() - .AddAttr("paddings") - .IsType>() - .End() - .AddAttr("padding_algorithm") - .IsOptional() - .IsStringIn({"EXPLICIT", "SAME", "VALID"}) - .End() - .AddAttr("groups") - .IsNumGE(1) - .End() - .AddAttr("dilations") - .IsType>() - .End() - .AddAttr("data_format") - .IsStringIn({"NCHW", "AnyLayout"}) - .End(); - AddOpCompat(OpCompat("affine_channel")) - .AddInput("X") - .IsTensor() - .End() - .AddInput("Scale") - .IsTensor() - .End() - .AddInput("Bias") - .IsTensor() - .IsOptional() - .End() - .AddOutput("Out") - .IsTensor() - .End() - .AddAttr("data_layout") - .IsStringIn({"NCHW", "AnyLayout"}) - .End(); - AddOpCompat(OpCompat("elementwise_add")) - .AddInput("X") - .IsTensor() - .End() - .AddInput("Y") - .IsTensor() - .End() - .AddOutput("Out") - .IsTensor() - .End() - .AddAttr("axis") - .IsNumEQ(1) - .End(); -} - -void ConvEltwiseAddAffineChannelFusePass::ApplyImpl(ir::Graph* graph) const { - PADDLE_ENFORCE_NOT_NULL( - graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); - FusePassBase::Init(name_scope_, graph); - - auto* scope = param_scope(); - PADDLE_ENFORCE_NOT_NULL( - scope, platform::errors::InvalidArgument("Scope cannot be nullptr.")); - - GraphPatternDetector gpd; - auto* conv_input = - gpd.mutable_pattern() - ->NewNode(patterns::PDNodeName(name_scope_, "conv_input")) - ->AsInput() - ->assert_is_op_input("conv2d", "Input"); - patterns::ConvAffineChannel conv_ac_pattern(gpd.mutable_pattern(), - name_scope_); - conv_ac_pattern(conv_input, true /*with_eltwise_add*/); - - int found_conv_ac_count = 0; - auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, - Graph* g) { - if (!IsCompat(subgraph, g)) { - LOG(WARNING) - << "ConvEltwiseAddAffineChannelFusePass in op compat failed."; - return; - } - - VLOG(4) << "handle ConvBN fuse"; - - GET_CONV_BN_NODES(conv_ac_pattern); - auto data_format = conv->Op()->GetAttrIfExists("data_format"); - if (data_format == "AnyLayout") { - LOG_FIRST_N(WARNING, 1) << "conv_eltwiseadd_affine_channel_fuse_pass is " - "enabled, it's wrong if data_format of conv " - "is not NCHW."; - } - // OPERATORS - GET_IR_NODE_FROM_SUBGRAPH(eltwise, eltwise, conv_ac_pattern); - // BIAS inputs - GET_IR_NODE_FROM_SUBGRAPH(eltwise_y_in, eltwise_y_in, conv_ac_pattern); - // BIAS outputs - GET_IR_NODE_FROM_SUBGRAPH(eltwise_out, eltwise_out, conv_ac_pattern); - - // Get eltwise_y (conv bias) variable - auto* eltwise_y_in_tensor = - scope->FindVar(eltwise_y_in->Name())->GetMutable(); - - // Get batch norm bias - auto* ac_bias_tensor = - scope->FindVar(ac_bias->Name())->GetMutable(); - - recompute_bias_and_weights(scope, conv_weight, *ac_scale, *ac_bias_tensor, - eltwise_y_in_tensor); - - // Update the elementwise_add node - eltwise->Op()->SetAttr("axis", 1); - eltwise->Op()->SetOutput("Out", std::vector({ac_out->Name()})); - - 
GraphSafeRemoveNodes(graph, - {ac_scale, ac_bias, affine_channel, eltwise_out}); - - IR_NODE_LINK_TO(eltwise, ac_out); - - found_conv_ac_count++; - }; - - gpd(graph, handler); - AddStatis(found_conv_ac_count); -} - -} // namespace ir -} // namespace framework -} // namespace paddle - -REGISTER_PASS(conv_affine_channel_fuse_pass, - paddle::framework::ir::ConvAffineChannelFusePass); -REGISTER_PASS(conv_eltwiseadd_affine_channel_fuse_pass, - paddle::framework::ir::ConvEltwiseAddAffineChannelFusePass); -REGISTER_PASS_CAPABILITY(conv_affine_channel_fuse_pass) - .AddCombination( - paddle::framework::compatible::OpVersionComparatorCombination() - .LE("conv2d", 1) - .EQ("affine_channel", 0)); -REGISTER_PASS_CAPABILITY(conv_eltwiseadd_affine_channel_fuse_pass) - .AddCombination( - paddle::framework::compatible::OpVersionComparatorCombination() - .LE("conv2d", 1) - .LE("elementwise_add", 1) - .EQ("affine_channel", 0)); diff --git a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h deleted file mode 100644 index 8cfaf5c6a89f06b453dbbc94b5a7fe8b83e5c111..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include - -#include "paddle/fluid/framework/ir/fuse_pass_base.h" -#include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/graph_pattern_detector.h" - -namespace paddle { -namespace framework { -namespace ir { - -/* - * Fuse the Conv and ConvAffineChannel. - */ -class Graph; - -class ConvAffineChannelFusePass : public FusePassBase { - public: - ConvAffineChannelFusePass(); - virtual ~ConvAffineChannelFusePass() {} - - protected: - void ApplyImpl(ir::Graph*) const override; - const std::string name_scope_{"conv_affine_channel_fuse"}; -}; - -class ConvEltwiseAddAffineChannelFusePass : public FusePassBase { - public: - ConvEltwiseAddAffineChannelFusePass(); - virtual ~ConvEltwiseAddAffineChannelFusePass() {} - - protected: - void ApplyImpl(ir::Graph*) const override; - const std::string name_scope_{"conv_eltwiseadd_affine_channel_fuse"}; -}; - -} // namespace ir -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.cc b/paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..f48224cbdc24fe9706a3c4eae029c6dc35381ad2 --- /dev/null +++ b/paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.cc @@ -0,0 +1,471 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 NVIDIA Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.h" +#include +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { +namespace ir { + +void FuseGemmEpiloguePass::ApplyImpl(ir::Graph *graph) const { + EpiloguePassActivationCache cache; + + graph = FuseLinearActFwd(graph, {"relu", "gelu"}, false, false, &cache); + graph = FuseLinearActFwd(graph, {"relu"}, true, true, &cache); + graph = FuseLinearActFwd(graph, {"gelu"}, true, false, &cache); + graph = FuseLinearFwd(graph, false); + graph = FuseLinearFwd(graph, true); + graph = FuseLinearActBwd(graph, {"relu_grad"}, true, &cache); + graph = FuseLinearActBwd(graph, {"gelu_grad"}, false, &cache); + graph = FuseLinearBwd(graph, false); + graph = FuseLinearBwd(graph, true); +} + +ir::Graph *FuseGemmEpiloguePass::FuseLinearFwd(ir::Graph *graph, + bool is_training) const { + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); + const std::string scope_name("gemm_epilogue"); + FusePassBase::Init(scope_name, graph); + + GraphPatternDetector gpd; + auto *x = gpd.mutable_pattern() + ->NewNode(patterns::PDNodeName(scope_name, "x")) + ->AsInput() + ->assert_is_op_input("matmul_v2", "X"); + patterns::LinearAct linear_act_pattern(gpd.mutable_pattern(), "linear_act"); + + linear_act_pattern(x, {}, is_training, false); + + int found_linear_count = 0; + + auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, + Graph *g) { + VLOG(4) << "handle LinearAct fuse"; + + GET_IR_NODE_FROM_SUBGRAPH(matmul_op, matmul, linear_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_w, matmul_w, linear_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_out, matmul_out, linear_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_add_op, ele_add, linear_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_bias, ele_bias, linear_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_out, elewise_add_out, linear_act_pattern); + + std::vector matmul_x_shape = subgraph.at(x)->Var()->GetShape(); + std::vector matmul_w_shape = matmul_w->Var()->GetShape(); + + // Note (Ming Huang): We only support matmul_v2 from paddle.nn.Linear + // currently. 
The conditions below are used to verify wether matmul_v2 + // is created by paddle.nn.Linear + auto matmul_op_desc = matmul_op->Op(); + if (!IsGemmFromLinear_(matmul_x_shape, matmul_w_shape, matmul_op_desc)) + return; + + OpDesc fused_gemm_epilogue_op_desc(matmul_op->Op()->Block()); + std::string activation = "none"; + fused_gemm_epilogue_op_desc.SetType("fused_gemm_epilogue"); + fused_gemm_epilogue_op_desc.SetInput("X", {subgraph.at(x)->Name()}); + fused_gemm_epilogue_op_desc.SetInput("Y", {matmul_w->Name()}); + fused_gemm_epilogue_op_desc.SetInput("Bias", {ele_bias->Name()}); + fused_gemm_epilogue_op_desc.SetOutput("Out", {ele_out->Name()}); + fused_gemm_epilogue_op_desc.SetAttr("activation", activation); + fused_gemm_epilogue_op_desc.SetAttr("op_role", + matmul_op_desc->GetAttr("op_role")); + auto gemm_epilogue_node = g->CreateOpNode(&fused_gemm_epilogue_op_desc); + + IR_NODE_LINK_TO(subgraph.at(x), gemm_epilogue_node); + IR_NODE_LINK_TO(matmul_w, gemm_epilogue_node); + IR_NODE_LINK_TO(ele_bias, gemm_epilogue_node); + IR_NODE_LINK_TO(gemm_epilogue_node, ele_out); + + GraphSafeRemoveNodes(g, {matmul_op, matmul_out, ele_add_op}); + + VLOG(4) << "\n\t " << subgraph.at(x)->Name() << " and " << matmul_w->Name() + << " -> " << matmul_op->Name() << " -> " << matmul_out->Name() + << "\n\t " << matmul_out->Name() << " and " << ele_bias->Name() + << " -> " << ele_add_op->Name() << " -> " << ele_out->Name() + << "\n\t " << ele_out->Name(); + found_linear_count++; + }; + + gpd(graph, handler); + + AddStatis(found_linear_count); + return graph; +} + +ir::Graph *FuseGemmEpiloguePass::FuseLinearActFwd( + ir::Graph *graph, const std::unordered_set &act_types, + bool is_training, bool is_act_grad_x_from_act, + EpiloguePassActivationCache *cache) const { + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); + + const std::string scope_name("gemm_epilogue"); + FusePassBase::Init(scope_name, graph); + + GraphPatternDetector gpd; + auto *x = gpd.mutable_pattern() + ->NewNode(patterns::PDNodeName(scope_name, "x")) + ->AsInput() + ->assert_is_op_input("matmul_v2", "X"); + patterns::LinearAct linear_act_pattern(gpd.mutable_pattern(), "linear_act"); + + linear_act_pattern(x, act_types, is_training, is_act_grad_x_from_act); + + int found_linear_act_count = 0; + + auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, + Graph *g) { + VLOG(4) << "handle LinearAct fuse"; + + GET_IR_NODE_FROM_SUBGRAPH(matmul_op, matmul, linear_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_w, matmul_w, linear_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_out, matmul_out, linear_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_add_op, ele_add, linear_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_bias, ele_bias, linear_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_out, elewise_add_out, linear_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(act_op, act, linear_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(act_out, act_out, linear_act_pattern); + + std::vector matmul_x_shape = subgraph.at(x)->Var()->GetShape(); + std::vector matmul_w_shape = matmul_w->Var()->GetShape(); + + // Note (Ming Huang): We only support matmul_v2 from paddle.nn.Linear + // currently. 
The conditions below are used to verify wether matmul_v2 + // is created by paddle.nn.Linear + auto matmul_op_desc = matmul_op->Op(); + if (!IsGemmFromLinear_(matmul_x_shape, matmul_w_shape, matmul_op_desc)) + return; + + auto activation = act_op->Op()->Type(); + + OpDesc fused_gemm_epilogue_op_desc(matmul_op->Op()->Block()); + fused_gemm_epilogue_op_desc.SetType("fused_gemm_epilogue"); + fused_gemm_epilogue_op_desc.SetInput("X", {subgraph.at(x)->Name()}); + fused_gemm_epilogue_op_desc.SetInput("Y", {matmul_w->Name()}); + fused_gemm_epilogue_op_desc.SetInput("Bias", {ele_bias->Name()}); + fused_gemm_epilogue_op_desc.SetOutput("Out", {act_out->Name()}); + fused_gemm_epilogue_op_desc.SetAttr("activation", activation); + fused_gemm_epilogue_op_desc.SetAttr("op_role", + matmul_op_desc->GetAttr("op_role")); + + auto gemm_epilogue_node = g->CreateOpNode(&fused_gemm_epilogue_op_desc); + + IR_NODE_LINK_TO(subgraph.at(x), gemm_epilogue_node); + IR_NODE_LINK_TO(matmul_w, gemm_epilogue_node); + IR_NODE_LINK_TO(ele_bias, gemm_epilogue_node); + IR_NODE_LINK_TO(gemm_epilogue_node, act_out); + + // Only need to check weight.shape[1] for auxiliary pointer + // and mark it the act op is fused for backward epilogue fusion. + // That because cuBlasLt epilogue's restriction. + if (is_training) { + int divisor_of_n = activation == "relu" ? 128 : 8; + if (matmul_w_shape[1] % divisor_of_n) return; + + VarDesc reserve_space(patterns::PDNodeName(scope_name, "ReserveSpace")); + auto *reserve_space_node = g->CreateVarNode(&reserve_space); + + cache->InsertFusedActivation( + GetReserveSpaceCacheKey(act_out->Var()->Name(), g->GetBlockId()), + reserve_space_node); + + gemm_epilogue_node->Op()->SetOutput("ReserveSpace", + {reserve_space_node->Name()}); + + if (!is_act_grad_x_from_act) { + GET_IR_NODE_FROM_SUBGRAPH(act_grad_op, act_grad, linear_act_pattern); + act_grad_op->Op()->RenameInput(ele_out->Name(), + reserve_space_node->Name()); + IR_NODE_LINK_TO(reserve_space_node, act_grad_op); + } + IR_NODE_LINK_TO(gemm_epilogue_node, reserve_space_node); + } + + GraphSafeRemoveNodes(g, + {matmul_op, matmul_out, ele_add_op, ele_out, act_op}); + + VLOG(4) << "\n\t " << subgraph.at(x)->Name() << " and " << matmul_w->Name() + << " -> " << matmul_op->Name() << " -> " << matmul_out->Name() + << "\n\t " << matmul_out->Name() << " and " << ele_bias->Name() + << " -> " << ele_add_op->Name() << " -> " << ele_out->Name() + << "\n\t " << ele_out->Name() << " -> " << act_op->Name() << " -> " + << act_out->Name(); + found_linear_act_count++; + }; + + gpd(graph, handler); + + AddStatis(found_linear_act_count); + return graph; +} + +ir::Graph *FuseGemmEpiloguePass::FuseLinearBwd(ir::Graph *graph, + bool without_x_gradient) const { + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); + const std::string scope_name("gemm_epilogue"); + FusePassBase::Init(scope_name, graph); + + GraphPatternDetector gpd; + auto *dout = + gpd.mutable_pattern() + ->NewNode(patterns::PDNodeName(scope_name, "dout")) + ->AsInput() + ->assert_is_op_input("elementwise_add_grad", GradVarName("Out")); + + patterns::ElewiseAddMatmulAct ele_add_matmul_act_pattern( + gpd.mutable_pattern(), "ele_add_matmul_act"); + ele_add_matmul_act_pattern(dout, {}, without_x_gradient, false); + + int found_ele_add_matmul_act_count = 0; + + auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, + Graph *g) { + VLOG(4) << "handle ElewiseAddMatmulAct fuse"; + + GET_IR_NODE_FROM_SUBGRAPH(ele_add_grad_op, ele_add_grad, + 
ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_grad_bias, ele_grad_bias, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_grad_dx, ele_grad_dx, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_grad_dbias, ele_grad_dbias, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_grad_op, matmul_grad, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_grad_x, matmul_grad_x, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_grad_w, matmul_grad_w, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_grad_dw, matmul_grad_dw, + ele_add_matmul_act_pattern); + + Node *matmul_grad_dx = nullptr; + if (!without_x_gradient) { + GET_IR_NODE_FROM_SUBGRAPH(matmul_grad_dx_ptr, matmul_grad_dx, + ele_add_matmul_act_pattern); + matmul_grad_dx = matmul_grad_dx_ptr; + } + + std::vector matmul_grad_x_shape = matmul_grad_x->Var()->GetShape(); + std::vector matmul_grad_w_shape = matmul_grad_w->Var()->GetShape(); + + // Note (Ming Huang): We only support matmul_v2_grad from paddle.nn.Linear + // currently. The conditions below are used to verify wether matmul_v2 + // is created by paddle.nn.Linear + auto matmul_grad_op_desc = matmul_grad_op->Op(); + if (!IsGemmFromLinear_(matmul_grad_x_shape, matmul_grad_w_shape, + matmul_grad_op_desc)) + return; + + OpDesc fused_gemm_epilogue_grad_op_desc(ele_add_grad_op->Op()->Block()); + std::string activation_grad = "none"; + fused_gemm_epilogue_grad_op_desc.SetType("fused_gemm_epilogue_grad"); + fused_gemm_epilogue_grad_op_desc.SetInput("DOut", + {subgraph.at(dout)->Name()}); + fused_gemm_epilogue_grad_op_desc.SetInput("X", {matmul_grad_x->Name()}); + fused_gemm_epilogue_grad_op_desc.SetInput("Y", {matmul_grad_w->Name()}); + if (matmul_grad_dx) { + fused_gemm_epilogue_grad_op_desc.SetOutput("DX", + {matmul_grad_dx->Name()}); + } + fused_gemm_epilogue_grad_op_desc.SetOutput("DY", {matmul_grad_dw->Name()}); + fused_gemm_epilogue_grad_op_desc.SetOutput("DBias", + {ele_grad_dbias->Name()}); + fused_gemm_epilogue_grad_op_desc.SetAttr("activation_grad", + activation_grad); + fused_gemm_epilogue_grad_op_desc.SetAttr( + "op_role", matmul_grad_op_desc->GetAttr("op_role")); + + auto gemm_epilogue_grad_node = + g->CreateOpNode(&fused_gemm_epilogue_grad_op_desc); + + IR_NODE_LINK_TO(subgraph.at(dout), gemm_epilogue_grad_node); + IR_NODE_LINK_TO(matmul_grad_x, gemm_epilogue_grad_node); + IR_NODE_LINK_TO(matmul_grad_w, gemm_epilogue_grad_node); + IR_NODE_LINK_TO(gemm_epilogue_grad_node, matmul_grad_dw); + IR_NODE_LINK_TO(gemm_epilogue_grad_node, ele_grad_dbias); + if (matmul_grad_dx) { + IR_NODE_LINK_TO(gemm_epilogue_grad_node, matmul_grad_dx); + } + + GraphSafeRemoveNodes(g, {ele_add_grad_op, ele_grad_dx, matmul_grad_op}); + + std::string matmul_grad_dx_name = + matmul_grad_dx != nullptr ? 
matmul_grad_dx->Name() : " "; + VLOG(4) << "\n\t " << subgraph.at(dout)->Name() << " and " + << ele_grad_bias->Name() << " -> " << ele_add_grad_op->Name() + << " -> " << ele_grad_dx->Name() << " and " + << ele_grad_dbias->Name() << "\n\t " << ele_grad_dx->Name() << ", " + << matmul_grad_x->Name() << " and " << matmul_grad_w->Name() + << " -> " << matmul_grad_op->Name() << " -> " + << matmul_grad_w->Name() << " and " << matmul_grad_dx_name; + found_ele_add_matmul_act_count++; + }; + + gpd(graph, handler); + + AddStatis(found_ele_add_matmul_act_count); + return graph; +} + +ir::Graph *FuseGemmEpiloguePass::FuseLinearActBwd( + ir::Graph *graph, const std::unordered_set &act_grad_types, + bool is_act_grad_x_from_act, EpiloguePassActivationCache *cache) const { + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); + const std::string scope_name("gemm_epilogue"); + FusePassBase::Init(scope_name, graph); + + GraphPatternDetector gpd; + auto *dout = + gpd.mutable_pattern() + ->NewNode(patterns::PDNodeName(scope_name, "dout")) + ->AsInput() + ->assert_is_op_input("elementwise_add_grad", GradVarName("Out")); + + patterns::ElewiseAddMatmulAct ele_add_matmul_act_pattern( + gpd.mutable_pattern(), "ele_add_matmul_act"); + ele_add_matmul_act_pattern(dout, act_grad_types, false, + is_act_grad_x_from_act); + + int found_ele_add_matmul_act_count = 0; + + auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, + Graph *g) { + VLOG(4) << "handle ElewiseAddMatmulAct fuse"; + + GET_IR_NODE_FROM_SUBGRAPH(ele_add_grad_op, ele_add_grad, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_grad_bias, ele_grad_bias, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_grad_dx, ele_grad_dx, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_grad_dbias, ele_grad_dbias, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_grad_op, matmul_grad, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_grad_x, matmul_grad_x, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_grad_w, matmul_grad_w, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_grad_dx, matmul_grad_dx, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_grad_dw, matmul_grad_dw, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(act_grad_op, act_grad, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(act_grad_dx, act_grad_dx, + ele_add_matmul_act_pattern); + + auto key = + GetReserveSpaceCacheKey(matmul_grad_x->Var()->Name(), g->GetBlockId()); + if (!cache->HasFusedActivation(key)) { + return; + } + auto *reserve_space_node = cache->GetFusedActivationSpace(key); + + std::vector matmul_grad_x_shape = matmul_grad_x->Var()->GetShape(); + std::vector matmul_grad_w_shape = matmul_grad_w->Var()->GetShape(); + + // Note (Ming Huang): We only support matmul_v2_grad from paddle.nn.Linear + // currently. 
The conditions below are used to verify wether matmul_v2 + // is created by paddle.nn.Linear + auto matmul_grad_op_desc = matmul_grad_op->Op(); + if (!IsGemmFromLinear_(matmul_grad_x_shape, matmul_grad_w_shape, + matmul_grad_op_desc)) + return; + + auto activation_grad = act_grad_op->Op()->Type(); + + OpDesc fused_gemm_epilogue_grad_op_desc(ele_add_grad_op->Op()->Block()); + fused_gemm_epilogue_grad_op_desc.SetType("fused_gemm_epilogue_grad"); + fused_gemm_epilogue_grad_op_desc.SetInput("DOut", + {subgraph.at(dout)->Name()}); + fused_gemm_epilogue_grad_op_desc.SetInput("X", {matmul_grad_x->Name()}); + fused_gemm_epilogue_grad_op_desc.SetInput("Y", {matmul_grad_w->Name()}); + fused_gemm_epilogue_grad_op_desc.SetInput("ReserveSpace", + {reserve_space_node->Name()}); + fused_gemm_epilogue_grad_op_desc.SetOutput("DX", {act_grad_dx->Name()}); + fused_gemm_epilogue_grad_op_desc.SetOutput("DY", {matmul_grad_dw->Name()}); + fused_gemm_epilogue_grad_op_desc.SetOutput("DBias", + {ele_grad_dbias->Name()}); + fused_gemm_epilogue_grad_op_desc.SetAttr("activation_grad", + activation_grad); + fused_gemm_epilogue_grad_op_desc.SetAttr( + "op_role", matmul_grad_op_desc->GetAttr("op_role")); + + auto gemm_epilogue_grad_node = + g->CreateOpNode(&fused_gemm_epilogue_grad_op_desc); + + IR_NODE_LINK_TO(subgraph.at(dout), gemm_epilogue_grad_node); + IR_NODE_LINK_TO(matmul_grad_x, gemm_epilogue_grad_node); + IR_NODE_LINK_TO(matmul_grad_w, gemm_epilogue_grad_node); + IR_NODE_LINK_TO(gemm_epilogue_grad_node, act_grad_dx); + IR_NODE_LINK_TO(gemm_epilogue_grad_node, matmul_grad_dw); + IR_NODE_LINK_TO(gemm_epilogue_grad_node, ele_grad_dbias); + IR_NODE_LINK_TO(reserve_space_node, gemm_epilogue_grad_node); + + GraphSafeRemoveNodes(g, {ele_add_grad_op, ele_grad_dx, matmul_grad_op, + matmul_grad_dx, act_grad_op}); + + VLOG(4) << "\n\t " << subgraph.at(dout)->Name() << " and " + << ele_grad_bias->Name() << " -> " << ele_add_grad_op->Name() + << " -> " << ele_grad_dx->Name() << " and " + << ele_grad_dbias->Name() << "\n\t " << ele_grad_dx->Name() << ", " + << matmul_grad_x->Name() << " and " << matmul_grad_w->Name() + << " -> " << matmul_grad_op->Name() << " -> " + << matmul_grad_dx->Name() << " and " << matmul_grad_w->Name() + << "\n\t " << matmul_grad_dx->Name() << " -> " + << act_grad_op->Name() << " -> " << act_grad_dx->Name(); + found_ele_add_matmul_act_count++; + }; + + gpd(graph, handler); + + AddStatis(found_ele_add_matmul_act_count); + return graph; +} + +bool FuseGemmEpiloguePass::IsGemmFromLinear_( + const std::vector &x_shape, const std::vector &w_shape, + OpDesc *matmul_v2_op) const { + if (w_shape.size() != 2 || x_shape.size() < 2) return false; + for (auto attr_name : + {"fused_reshape_Out", "fused_reshape_X", "fused_reshape_Y", + "fused_transpose_Out", "fused_transpose_X", "fused_transpose_Y"}) { + if (matmul_v2_op->HasAttr(attr_name)) { + std::vector tmp_vec = + BOOST_GET_CONST(std::vector, matmul_v2_op->GetAttr(attr_name)); + if (tmp_vec.size() > 0) return false; + } + } + if (BOOST_GET_CONST(bool, matmul_v2_op->GetAttr("trans_x")) || + BOOST_GET_CONST(bool, matmul_v2_op->GetAttr("trans_y"))) + return false; + + return true; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(fuse_gemm_epilogue_pass, + paddle::framework::ir::FuseGemmEpiloguePass); diff --git a/paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.h b/paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.h new file mode 100644 index 
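Two eligibility rules drive the fusions implemented above: matmul_v2 must look like it came from paddle.nn.Linear (2-D weight, rank >= 2 input, no trans_x/trans_y, no fused reshape/transpose attributes), and in training graphs the weight's N dimension must satisfy the cuBlasLt epilogue alignment before a ReserveSpace output is attached (N % 128 == 0 for relu, N % 8 == 0 for gelu). A standalone sketch of those two checks; the helper names are illustrative, not framework APIs.

#include <cstdint>
#include <string>
#include <vector>

// Mirrors IsGemmFromLinear_: paddle.nn.Linear yields a 2-D weight, an input of
// rank >= 2, and no transposed operands on matmul_v2.
bool LooksLikeLinearGemm(const std::vector<int64_t>& x_shape,
                         const std::vector<int64_t>& w_shape, bool trans_x,
                         bool trans_y) {
  return w_shape.size() == 2 && x_shape.size() >= 2 && !trans_x && !trans_y;
}

// Mirrors the divisor_of_n check in FuseLinearActFwd: the cuBlasLt auxiliary
// (ReserveSpace) output is only usable when N is suitably aligned.
bool EpilogueAuxUsable(const std::string& activation, int64_t n) {
  const int64_t divisor_of_n = activation == "relu" ? 128 : 8;
  return n % divisor_of_n == 0;
}

Worth noting that in FuseLinearActFwd the alignment check runs after the fused node has already been created and linked, so an early return there appears to leave that node in the graph; whether that is intentional is not clear from this patch alone.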
0000000000000000000000000000000000000000..575ffee73d60e9bd5d4f5af7538d01789268cc9a --- /dev/null +++ b/paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.h @@ -0,0 +1,100 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 NVIDIA Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +/* + * Fuse the ElewiseAdd and activation + */ +class Graph; +class Node; + +class EpiloguePassActivationCache { + public: + EpiloguePassActivationCache() {} + + EpiloguePassActivationCache(const EpiloguePassActivationCache &) = delete; + void operator=(const EpiloguePassActivationCache &) = delete; + + bool HasFusedActivation(const std::string &key) const { + return fused_activation_space_map_.count(key); + } + + ir::Node *GetFusedActivationSpace(const std::string &key) { + if (HasFusedActivation(key)) { + return fused_activation_space_map_.find(key)->second; + } + PADDLE_THROW(platform::errors::InvalidArgument( + "The key (%d) of EpiloguePassActivationCache does not exist.", key)); + } + + void InsertFusedActivation(const std::string &key, ir::Node *const value) { + if (!HasFusedActivation(key)) { + mtx.lock(); + fused_activation_space_map_.insert({key, value}); + mtx.unlock(); + } else { + PADDLE_THROW(platform::errors::AlreadyExists( + "The key (%d) of EpiloguePassActivationCache already exist.", key)); + } + } + + private: + std::unordered_map fused_activation_space_map_; + std::mutex mtx; +}; + +class FuseGemmEpiloguePass : public FusePassBase { + public: + virtual ~FuseGemmEpiloguePass() {} + + protected: + void ApplyImpl(ir::Graph *graph) const override; + + ir::Graph *FuseLinearFwd(ir::Graph *graph, bool is_training) const; + ir::Graph *FuseLinearActFwd(ir::Graph *graph, + const std::unordered_set &act_types, + bool is_training, bool is_act_grad_x_from_act, + EpiloguePassActivationCache *cache) const; + ir::Graph *FuseLinearBwd(ir::Graph *graph, bool without_x_gradient) const; + ir::Graph *FuseLinearActBwd( + ir::Graph *graph, const std::unordered_set &act_grad_types, + bool is_act_grad_x_from_act, EpiloguePassActivationCache *cache) const; + + private: + bool IsGemmFromLinear_(const std::vector &x_shape, + const std::vector &w_shape, + OpDesc *matmul_v2_op) const; + const std::string GetReserveSpaceCacheKey(const std::string var_name, + int block_id) const { + return std::to_string(block_id) + var_name; + } +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index e4c9dc72128f4850b2e0e4af739fdd381e4a3b1e..d7d866fa98bb5895e4f3175e227f7b3c2ce869b6 100644 --- 
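One detail of EpiloguePassActivationCache above: InsertFusedActivation performs the HasFusedActivation check before taking the mutex, and both error messages format a std::string key with %d (presumably intended as %s). A sketch of an insert that does the existence check and the map update under a single lock; it assumes the fused_activation_space_map_ and mtx members declared in the class, plus <mutex> for std::lock_guard.

// Sketch of a race-free variant of InsertFusedActivation; same members as the
// class above, with the check and the insert done under one lock_guard.
void InsertFusedActivation(const std::string &key, ir::Node *const value) {
  std::lock_guard<std::mutex> guard(mtx);
  bool inserted = fused_activation_space_map_.insert({key, value}).second;
  if (!inserted) {
    PADDLE_THROW(platform::errors::AlreadyExists(
        "The key (%s) of EpiloguePassActivationCache already exist.", key));
  }
}

A delimiter between block_id and var_name in GetReserveSpaceCacheKey would likewise keep cache keys unambiguous.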
a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -1461,31 +1461,6 @@ PDNode *patterns::BatchNormAddActGrad::operator()( return bn_grad; } -PDNode *patterns::ElewiseAddAct::operator()( - paddle::framework::ir::PDNode *ele_x_var, - std::unordered_set act_types) { - auto *ele_y_var = pattern->NewNode(ele_y_repr()) - ->assert_is_op_input("elementwise_add", "Y"); - - auto *ele_add = - pattern->NewNode(ele_add_repr())->assert_is_op("elementwise_add"); - - auto *ele_out_var = pattern->NewNode(elewise_add_out_repr()) - ->assert_is_op_output("elementwise_add", "Out"); - - ele_out_var->AsIntermediate()->assert_is_ops_input(act_types); - - auto *act = pattern->NewNode(act_repr())->assert_is_ops(act_types); - - auto *act_out_var = - pattern->NewNode(act_out_repr())->assert_is_ops_output(act_types, "Out"); - - ele_add->LinksFrom({ele_x_var, ele_y_var}).LinksTo({ele_out_var}); - act->LinksFrom({ele_out_var}).LinksTo({act_out_var}); - - return act_out_var; -} - PDNode *patterns::ElewiseAddActInplaceGrad::operator()( paddle::framework::ir::PDNode *d_act_out_var, std::unordered_set act_types) { @@ -1526,6 +1501,159 @@ PDNode *patterns::ElewiseAddActInplaceGrad::operator()( return ele_add_grad; } +PDNode *patterns::ElewiseAddAct::operator()( + paddle::framework::ir::PDNode *ele_x_var, + std::unordered_set act_types) { + auto *ele_y_var = pattern->NewNode(ele_y_repr()) + ->assert_is_op_input("elementwise_add", "Y"); + + auto *ele_add = + pattern->NewNode(ele_add_repr())->assert_is_op("elementwise_add"); + + auto *ele_out_var = pattern->NewNode(elewise_add_out_repr()) + ->assert_is_op_output("elementwise_add", "Out"); + + ele_out_var->AsIntermediate()->assert_is_ops_input(act_types); + + auto *act = pattern->NewNode(act_repr())->assert_is_ops(act_types); + + auto *act_out_var = + pattern->NewNode(act_out_repr())->assert_is_ops_output(act_types, "Out"); + + ele_add->LinksFrom({ele_x_var, ele_y_var}).LinksTo({ele_out_var}); + act->LinksFrom({ele_out_var}).LinksTo({act_out_var}); + + return act_out_var; +} + +PDNode *patterns::LinearAct::operator()( + paddle::framework::ir::PDNode *linear_x_var, + const std::unordered_set &act_types, bool with_grad_link, + bool is_act_grad_x_from_act) { + auto *matmul_w_var = + pattern->NewNode(matmul_w_repr())->assert_is_op_input("matmul_v2", "Y"); + + auto *matmul = pattern->NewNode(matmul_repr())->assert_is_op("matmul_v2"); + + auto *matmul_out_var = pattern->NewNode(matmul_out_repr()) + ->assert_is_op_output("matmul_v2", "Out"); + + matmul_out_var->AsIntermediate()->assert_is_op_input("elementwise_add", "X"); + + auto *ele_bias_var = pattern->NewNode(ele_bias_repr()) + ->assert_is_op_input("elementwise_add", "Y"); + + auto *ele_add = + pattern->NewNode(ele_add_repr())->assert_is_op("elementwise_add"); + + auto *ele_out_var = pattern->NewNode(elewise_add_out_repr()) + ->assert_is_op_output("elementwise_add", "Out"); + + matmul->LinksFrom({linear_x_var, matmul_w_var}).LinksTo({matmul_out_var}); + ele_add->LinksFrom({matmul_out_var, ele_bias_var}).LinksTo({ele_out_var}); + + if (with_grad_link) { + matmul_out_var->assert_is_op_input("elementwise_add_grad", "X"); + auto *elementwise_add_grad_op = pattern->NewNode("elementwise_add_grad") + ->assert_is_op("elementwise_add_grad"); + elementwise_add_grad_op->LinksFrom({matmul_out_var}); + } + + if (act_types.size() > 0) { + ele_out_var->AsIntermediate()->assert_is_ops_input(act_types); + + auto *act = pattern->NewNode(act_repr())->assert_is_ops(act_types); + auto 
*act_out_var = pattern->NewNode(act_out_repr()) + ->assert_is_ops_output(act_types, "Out"); + + act->LinksFrom({ele_out_var}).LinksTo({act_out_var}); + + if (with_grad_link && !is_act_grad_x_from_act) { + std::unordered_set act_grad_types; + for (const auto &act : act_types) { + std::string act_grad(act); + act_grad.append("_grad"); + act_grad_types.insert(act_grad); + } + + ele_out_var->assert_is_ops_input(act_grad_types, "X"); + auto *act_grad_op = + pattern->NewNode(act_grad_repr())->assert_is_ops(act_grad_types); + act_grad_op->LinksFrom({ele_out_var}); + } + + return act_out_var; + } + + return ele_out_var; +} + +PDNode *patterns::ElewiseAddMatmulAct::operator()( + paddle::framework::ir::PDNode *dout_var, + const std::unordered_set &act_grad_types, + bool without_x_gradient, bool is_act_grad_x_from_act) { + auto *ele_grad_bias_var = + pattern->NewNode(ele_grad_bias_repr()) + ->assert_is_op_input("elementwise_add_grad", "Y"); + auto *ele_add_grad = pattern->NewNode(ele_add_grad_repr()) + ->assert_is_op("elementwise_add_grad"); + auto *ele_grad_dx_var = + pattern->NewNode(ele_grad_dx_repr()) + ->assert_is_op_output("elementwise_add_grad", GradVarName("X")); + auto *ele_grad_dbias_var = + pattern->NewNode(ele_grad_dbias_repr()) + ->assert_is_op_output("elementwise_add_grad", GradVarName("Y")); + ele_add_grad->LinksFrom({dout_var, ele_grad_bias_var}) + .LinksTo({ele_grad_dx_var, ele_grad_dbias_var}); + + ele_grad_dx_var->AsIntermediate()->assert_is_op_input("matmul_v2_grad", + GradVarName("Out")); + + auto *matmul_grad_x_var = pattern->NewNode(matmul_grad_x_repr()) + ->assert_is_op_input("matmul_v2_grad", "X"); + auto *matmul_grad_w_var = pattern->NewNode(matmul_grad_w_repr()) + ->assert_is_op_input("matmul_v2_grad", "Y"); + auto *matmul_grad = + pattern->NewNode(matmul_grad_repr())->assert_is_op("matmul_v2_grad"); + auto *matmul_grad_dx_var = + pattern->NewNode(matmul_grad_dx_repr()) + ->assert_is_op_output("matmul_v2_grad", GradVarName("X")); + auto *matmul_grad_dw_var = + pattern->NewNode(matmul_grad_dw_repr()) + ->assert_is_op_output("matmul_v2_grad", GradVarName("Y")); + matmul_grad->LinksFrom( + {ele_grad_dx_var, matmul_grad_x_var, matmul_grad_w_var}); + if (without_x_gradient) { + matmul_grad->LinksTo({matmul_grad_dw_var}); + } else { + matmul_grad->LinksTo({matmul_grad_dx_var, matmul_grad_dw_var}); + } + + if (!without_x_gradient && act_grad_types.size() > 0) { + matmul_grad_dx_var->AsIntermediate()->assert_is_ops_input( + act_grad_types, GradVarName("Out")); + + auto *act_grad = + pattern->NewNode(act_grad_repr())->assert_is_ops(act_grad_types); + auto *act_grad_dx_var = + pattern->NewNode(act_grad_dx_repr()) + ->assert_is_ops_output(act_grad_types, GradVarName("X")); + + auto *act_grad_x_var = matmul_grad_x_var; + if (!is_act_grad_x_from_act) { + auto *ele_out_var = pattern->NewNode(ele_out_repr()) + ->assert_is_ops_input(act_grad_types, "X"); + act_grad_x_var = ele_out_var; + } + + act_grad->LinksFrom({matmul_grad_dx_var, act_grad_x_var}) + .LinksTo({act_grad_dx_var}); + return act_grad; + } + + return matmul_grad; +} + // conv_type: conv2d, conv3d, conv2d_transpose PDNode *patterns::ConvBias::operator()( paddle::framework::ir::PDNode *conv_input, std::string conv_type) { diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index d6400ed6945bf8a60c1d4f357bf58a11d5b87094..0f21906d08d0e4fc8a54472ab40ceb08df9d1949 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ 
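When with_grad_link is set and the activation gradient does not take its input from the activation output, the LinearAct pattern above derives the backward op names by appending "_grad" to each forward activation type before asserting the act_grad link. A tiny standalone sketch of that name derivation; the function name is illustrative.

#include <string>
#include <unordered_set>

// Mirrors the loop in patterns::LinearAct: "relu" -> "relu_grad", "gelu" -> "gelu_grad".
std::unordered_set<std::string> ToActGradTypes(
    const std::unordered_set<std::string> &act_types) {
  std::unordered_set<std::string> act_grad_types;
  for (const auto &act : act_types) {
    act_grad_types.insert(act + "_grad");
  }
  return act_grad_types;
}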
b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -863,6 +863,65 @@ struct ElewiseAddActInplaceGrad : public PatternBase { PATTERN_DECL_NODE(ele_y); }; +// The following patterns are used to fuse linear and act (ReLu or GeLU) +// formula: act(F.linear(x)) +// op: matmul_v2 + elementwise_add + act +// named nodes: matmul, elementwise_add, act +// matmul_w, matmul_out +// ele_bias, elewise_add_out, act_out +struct LinearAct : public PatternBase { + LinearAct(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "linear_act") {} + + PDNode* operator()(PDNode* x, + const std::unordered_set& act_types, + bool with_grad_link, bool is_act_grad_x_from_act); + + // declare operator node's name + PATTERN_DECL_NODE(matmul); + PATTERN_DECL_NODE(ele_add); + PATTERN_DECL_NODE(act); + PATTERN_DECL_NODE(act_grad); + // declare variable node's name + PATTERN_DECL_NODE(matmul_w); + PATTERN_DECL_NODE(matmul_out); + PATTERN_DECL_NODE(elewise_add_out); + PATTERN_DECL_NODE(ele_bias); + PATTERN_DECL_NODE(act_out); +}; + +// The following patterns are used to fuse linear_grad and act_grad (ReLu or +// GeLU) +// formula: the backward of F.linear( act(x) ) +// op: elementwise_add_grad + matmul_v2_grad + act_grad +// named nodes: ele_add_grad, matmul_grad, act_grad +// ele_grad_bias, ele_grad_dx, ele_grad_dbias +// matmul_grad_x, matmul_grad_dx, matmul_grad_dx +// matmul_grad_dw, act_grad_dx +struct ElewiseAddMatmulAct : public PatternBase { + ElewiseAddMatmulAct(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "elewiseadd_matmul_act") {} + + PDNode* operator()(PDNode* x, + const std::unordered_set& act_grad_types, + bool without_x_gradient, bool is_act_grad_x_from_act); + + // declare operator node's name + PATTERN_DECL_NODE(ele_add_grad); + PATTERN_DECL_NODE(matmul_grad); + PATTERN_DECL_NODE(act_grad); + // declare variable node's name + PATTERN_DECL_NODE(ele_out); + PATTERN_DECL_NODE(ele_grad_bias); + PATTERN_DECL_NODE(ele_grad_dx); + PATTERN_DECL_NODE(ele_grad_dbias); + PATTERN_DECL_NODE(matmul_grad_x); + PATTERN_DECL_NODE(matmul_grad_w); + PATTERN_DECL_NODE(matmul_grad_dx); + PATTERN_DECL_NODE(matmul_grad_dw); + PATTERN_DECL_NODE(act_grad_dx); +}; + // Conv with Elementwise_add as bias // op: conv + elementwise_add // named nodes: diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc b/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc index d33dc7f49feb0f4c9e585d13186d65b6c2d618c0..636a594a657cb0744aac161d928ff9078b1f92bc 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc @@ -20,12 +20,15 @@ #include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h" #include "paddle/fluid/framework/parallel_executor.h" #include "paddle/fluid/framework/program_desc.h" +#include "paddle/phi/core/kernel_registry.h" USE_OP_ITSELF(scale); USE_OP(elementwise_mul); USE_OP_ITSELF(elementwise_add); USE_OP_ITSELF(elementwise_add_grad); +PD_DECLARE_KERNEL(scale, CPU, ALL_LAYOUT); + DECLARE_double(eager_delete_tensor_gb); namespace paddle { diff --git a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc index c537d05738529dcb885e86cbcabf4405fd75270b..2403e60df3918394e99c3284b2a417e336fc3bae 100644 
--- a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc @@ -22,6 +22,7 @@ #include "paddle/fluid/framework/ir/graph_traits.h" #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/string/pretty_log.h" namespace paddle { namespace framework { @@ -135,157 +136,9 @@ ResidualConnectionMKLDNNFusePass::ResidualConnectionMKLDNNFusePass() { .End(); } -ResidualConnectionMKLDNNFusePass::IdentityFuseHandle::IdentityFuseHandle( - const ResidualConnectionMKLDNNFusePass::CanFuseFunc& can_fuse_func, - const ResidualConnectionMKLDNNFusePass::IdentityConvFunc& - get_node_from_conv_op, - const ResidualConnectionMKLDNNFusePass::IdentityElementwiseAddFunc& - get_node_from_elementwise_add_op, - const ResidualConnectionMKLDNNFusePass* pass) - : fusion_stats{std::make_shared(0)}, - can_fuse_func{can_fuse_func}, - get_node_from_conv_op{get_node_from_conv_op}, - get_node_from_elementwise_add_op{get_node_from_elementwise_add_op}, - pass_{pass} {} - -void ResidualConnectionMKLDNNFusePass::IdentityFuseHandle::operator()( - const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) { - Node* conv_op; - Node* conv_input; - Node* conv_filter; - Node* conv_output; - - Node* elementwise_add_op; - Node* elementwise_add_identity; - Node* elementwise_add_out; - - std::tie(conv_op, conv_input, conv_filter, conv_output) = - get_node_from_conv_op(subgraph); - std::tie(elementwise_add_op, elementwise_add_identity, elementwise_add_out) = - get_node_from_elementwise_add_op(subgraph); - - if (!can_fuse_func(conv_op, elementwise_add_op)) return; - - if (!IsReachable(graph, elementwise_add_identity, conv_output)) return; - - if (HasFusedActivation(conv_op)) return; - - if (!pass_->IsCompat(subgraph, graph)) { - LOG(WARNING) - << "conv_elementwise_add_mkldnn_fuse_pass in op compat failed."; - return; - } - - conv_op->Op()->SetInput("ResidualData", {elementwise_add_identity->Name()}); - conv_op->Op()->SetOutput("Output", {elementwise_add_out->Name()}); - conv_op->Op()->SetAttr("fuse_residual_connection", true); - - GraphSafeRemoveNodes(graph, {conv_output, elementwise_add_op}); - - IR_NODE_LINK_TO(elementwise_add_identity, conv_op); - IR_NODE_LINK_TO(conv_op, elementwise_add_out); - - (*fusion_stats)++; -} - -ResidualConnectionMKLDNNFusePass::ProjectionFuseHandle::ProjectionFuseHandle( - const ResidualConnectionMKLDNNFusePass::CanFuseFunc& can_fuse_func, - const ResidualConnectionMKLDNNFusePass::ProjectionConvFunc& - get_node_from_conv_x_op, - const ResidualConnectionMKLDNNFusePass::ProjectionConvFunc& - get_node_from_conv_y_op, - const ResidualConnectionMKLDNNFusePass::ProjectionElementwiseAddFunc& - get_node_from_elementwise_add_op, - const ResidualConnectionMKLDNNFusePass* pass) - : fusion_stats{std::make_shared(0)}, - can_fuse_func{can_fuse_func}, - get_node_from_conv_x_op{get_node_from_conv_x_op}, - get_node_from_conv_y_op{get_node_from_conv_y_op}, - get_node_from_elementwise_add_op{get_node_from_elementwise_add_op}, - pass_{pass} {} - -void ResidualConnectionMKLDNNFusePass::ProjectionFuseHandle::operator()( - const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) { - Node* conv_x_op; - Node* conv_x_input; - Node* conv_x_filter; - Node* conv_x_output; - - Node* conv_y_op; - Node* conv_y_input; - Node* conv_y_filter; - Node* conv_y_output; - - Node* elementwise_add_op; - Node* elementwise_add_out; - - if (!pass_->IsCompat(subgraph, graph)) { - LOG(WARNING) - << 
"conv_elementwise_add_mkldnn_fuse_pass in op compat failed."; - return; - } - - std::tie(conv_x_op, conv_x_input, conv_x_filter, conv_x_output) = - get_node_from_conv_x_op(subgraph); - std::tie(conv_y_op, conv_y_input, conv_y_filter, conv_y_output) = - get_node_from_conv_y_op(subgraph); - std::tie(elementwise_add_op, elementwise_add_out) = - get_node_from_elementwise_add_op(subgraph); - - if (!can_fuse_func(conv_x_op, elementwise_add_op)) return; - if (!can_fuse_func(conv_y_op, elementwise_add_op)) return; - - Node* projection_node; - Node* residual_conv_op; - Node* residual_conv_output; - - if (IsReachable(graph, conv_x_input, conv_y_output)) { - projection_node = conv_x_output; - residual_conv_op = conv_y_op; - residual_conv_output = conv_y_output; - } else if (IsReachable(graph, conv_y_input, conv_x_output)) { - projection_node = conv_y_output; - residual_conv_op = conv_x_op; - residual_conv_output = conv_x_output; - } else { - return; - } - - if (HasFusedActivation(residual_conv_op)) return; - - residual_conv_op->Op()->SetInput("ResidualData", {projection_node->Name()}); - residual_conv_op->Op()->SetOutput("Output", {elementwise_add_out->Name()}); - - residual_conv_op->Op()->SetAttr("fuse_residual_connection", true); - - GraphSafeRemoveNodes(graph, {residual_conv_output, elementwise_add_op}); - - IR_NODE_LINK_TO(projection_node, residual_conv_op); - IR_NODE_LINK_TO(residual_conv_op, elementwise_add_out); - - (*fusion_stats)++; -} - -std::tuple -ResidualConnectionMKLDNNFusePass::GetNodesFromConv( - const patterns::Conv& conv_pattern, - const GraphPatternDetector::subgraph_t& subgraph) const { - GET_IR_NODE_FROM_SUBGRAPH(conv_op, conv_op, conv_pattern); - GET_IR_NODE_FROM_SUBGRAPH(conv_input, conv_input, conv_pattern); - GET_IR_NODE_FROM_SUBGRAPH(conv_filter, conv_filter, conv_pattern); - GET_IR_NODE_FROM_SUBGRAPH(conv_output, conv_output, conv_pattern); - - return std::make_tuple(conv_op, conv_input, conv_filter, conv_output); -} - GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsX( const std::string& name_scope, const GraphWithStats& graph_with_stats) const { - ir::Graph* graph; - int stats; - - std::tie(graph, stats) = graph_with_stats; - GraphPatternDetector gpd; auto pattern = gpd.mutable_pattern(); @@ -298,26 +151,56 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsX( pattern->NewNode(elementwise_add_pattern.elementwise_add_y_repr())); conv_output->AsIntermediate(); - auto get_node_from_elementwise_add = [&elementwise_add_pattern]( - const GraphPatternDetector::subgraph_t& subgraph) - -> std::tuple { - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op, - elementwise_add_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_y, elementwise_add_y, - elementwise_add_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out, - elementwise_add_pattern); - - return std::make_tuple(elementwise_add_op, elementwise_add_y, - elementwise_add_out); - }; - - return ExecuteHandleOnGraph( - &gpd, graph_with_stats, - [this, &conv_pattern](const GraphPatternDetector::subgraph_t& subgraph) { - return GetNodesFromConv(conv_pattern, subgraph); - }, - get_node_from_elementwise_add, this); + int found_conv_as_x_count = 0; + + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_IR_NODE_FROM_SUBGRAPH(conv_op, conv_op, conv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_input, conv_input, conv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_filter, conv_filter, conv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_output, 
conv_output, conv_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op, + elementwise_add_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_identity, elementwise_add_y, + elementwise_add_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out, + elementwise_add_pattern); + + if (FindFuseOption(*conv_op, *elementwise_add_op) != FUSE_MKLDNN) return; + + if (!IsReachable(g, elementwise_add_identity, conv_output)) return; + + if (HasFusedActivation(conv_op)) return; + + if (!IsCompat(subgraph, g)) { + LOG(WARNING) + << "conv_elementwise_add_mkldnn_fuse_pass in op compat failed."; + return; + } + + conv_op->Op()->SetInput("ResidualData", {elementwise_add_identity->Name()}); + conv_op->Op()->SetOutput("Output", {elementwise_add_out->Name()}); + conv_op->Op()->SetAttr("fuse_residual_connection", true); + + GraphSafeRemoveNodes(g, {conv_output, elementwise_add_op}); + + IR_NODE_LINK_TO(elementwise_add_identity, conv_op); + IR_NODE_LINK_TO(conv_op, elementwise_add_out); + + found_conv_as_x_count++; + }; + + gpd(graph_with_stats.first, handler); + if (!Has("disable_logs") || !Get("disable_logs")) { + std::stringstream msg_ss; + msg_ss << "--- Fused " << found_conv_as_x_count + << " conv (as x) + elementwise_add patterns"; + paddle::string::PrettyLogDetail(msg_ss.str().c_str()); + } + + return std::make_pair(graph_with_stats.first, + found_conv_as_x_count + graph_with_stats.second); } GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsY( @@ -335,26 +218,56 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsY( conv_output); conv_output->AsIntermediate(); - auto get_node_from_elementwise_add = [&elementwise_add_pattern]( - const GraphPatternDetector::subgraph_t& subgraph) - -> std::tuple { - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op, - elementwise_add_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_x, elementwise_add_x, - elementwise_add_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out, - elementwise_add_pattern); - - return std::make_tuple(elementwise_add_op, elementwise_add_x, - elementwise_add_out); - }; - - return ExecuteHandleOnGraph( - &gpd, graph_with_stats, - [this, &conv_pattern](const GraphPatternDetector::subgraph_t& subgraph) { - return GetNodesFromConv(conv_pattern, subgraph); - }, - get_node_from_elementwise_add, this); + int found_conv_as_y_count = 0; + + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_IR_NODE_FROM_SUBGRAPH(conv_op, conv_op, conv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_input, conv_input, conv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_filter, conv_filter, conv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_output, conv_output, conv_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op, + elementwise_add_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_x, elementwise_add_x, + elementwise_add_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out, + elementwise_add_pattern); + + if (FindFuseOption(*conv_op, *elementwise_add_op) != FUSE_MKLDNN) return; + + if (!IsReachable(g, elementwise_add_x, conv_output)) return; + + if (HasFusedActivation(conv_op)) return; + + if (!IsCompat(subgraph, g)) { + LOG(WARNING) + << "conv_elementwise_add_mkldnn_fuse_pass in op compat failed."; + return; + } + + conv_op->Op()->SetInput("ResidualData", {elementwise_add_x->Name()}); + conv_op->Op()->SetOutput("Output", {elementwise_add_out->Name()}); + 
conv_op->Op()->SetAttr("fuse_residual_connection", true); + + GraphSafeRemoveNodes(g, {conv_output, elementwise_add_op}); + + IR_NODE_LINK_TO(elementwise_add_x, conv_op); + IR_NODE_LINK_TO(conv_op, elementwise_add_out); + + found_conv_as_y_count++; + }; + + gpd(graph_with_stats.first, handler); + if (!Has("disable_logs") || !Get("disable_logs")) { + std::stringstream msg_ss; + msg_ss << "--- Fused " << found_conv_as_y_count + << " conv (as y) + elementwise_add patterns"; + paddle::string::PrettyLogDetail(msg_ss.str().c_str()); + } + + return std::make_pair(graph_with_stats.first, + found_conv_as_y_count + graph_with_stats.second); } GraphWithStats ResidualConnectionMKLDNNFusePass::FuseProjectionConv( @@ -374,39 +287,84 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseProjectionConv( conv_x_output->AsIntermediate(); conv_y_output->AsIntermediate(); - auto get_node_from_elementwise_add = [&elementwise_add_pattern]( - const GraphPatternDetector::subgraph_t& subgraph) - -> std::tuple { - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op, - elementwise_add_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out, - elementwise_add_pattern); - - return std::make_tuple(elementwise_add_op, elementwise_add_out); - }; - - return ExecuteHandleOnGraph( - &gpd, graph_with_stats, - [this, - &conv_x_pattern](const GraphPatternDetector::subgraph_t& subgraph) { - return GetNodesFromConv(conv_x_pattern, subgraph); - }, - [this, - &conv_y_pattern](const GraphPatternDetector::subgraph_t& subgraph) { - return GetNodesFromConv(conv_y_pattern, subgraph); - }, - get_node_from_elementwise_add, this); + int found_projection_conv_count = 0; + + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_IR_NODE_FROM_SUBGRAPH(conv_x_op, conv_op, conv_x_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_x_input, conv_input, conv_x_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_x_filter, conv_filter, conv_x_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_x_output, conv_output, conv_x_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(conv_y_op, conv_op, conv_y_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_y_input, conv_input, conv_y_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_y_filter, conv_filter, conv_y_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_y_output, conv_output, conv_y_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op, + elementwise_add_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out, + elementwise_add_pattern); + + if (!IsCompat(subgraph, g)) { + LOG(WARNING) + << "conv_elementwise_add_mkldnn_fuse_pass in op compat failed."; + return; + } + + if (FindFuseOption(*conv_x_op, *elementwise_add_op) != FUSE_MKLDNN) return; + if (FindFuseOption(*conv_y_op, *elementwise_add_op) != FUSE_MKLDNN) return; + + Node* projection_node; + Node* residual_conv_op; + Node* residual_conv_output; + if (IsReachable(g, conv_x_input, conv_y_output)) { + projection_node = conv_x_output; + residual_conv_op = conv_y_op; + residual_conv_output = conv_y_output; + } else if (IsReachable(g, conv_y_input, conv_x_output)) { + projection_node = conv_y_output; + residual_conv_op = conv_x_op; + residual_conv_output = conv_x_output; + } else { + return; + } + + if (HasFusedActivation(residual_conv_op)) return; + + residual_conv_op->Op()->SetInput("ResidualData", {projection_node->Name()}); + residual_conv_op->Op()->SetOutput("Output", {elementwise_add_out->Name()}); + + residual_conv_op->Op()->SetAttr("fuse_residual_connection", true); + + 
GraphSafeRemoveNodes(g, {residual_conv_output, elementwise_add_op}); + + IR_NODE_LINK_TO(projection_node, residual_conv_op); + IR_NODE_LINK_TO(residual_conv_op, elementwise_add_out); + + found_projection_conv_count++; + }; + + gpd(graph_with_stats.first, handler); + if (!Has("disable_logs") || !Get("disable_logs")) { + std::stringstream msg_ss; + msg_ss << "--- Fused " << found_projection_conv_count + << " projection conv (as y) + elementwise_add patterns"; + paddle::string::PrettyLogDetail(msg_ss.str().c_str()); + } + + return std::make_pair(graph_with_stats.first, + found_projection_conv_count + graph_with_stats.second); } -void ResidualConnectionMKLDNNFusePass::ApplyImpl(graph_ptr graph) const { +void ResidualConnectionMKLDNNFusePass::ApplyImpl(ir::Graph* graph) const { FusePassBase::Init(name_scope_, graph); - auto fused_graph_with_stats = FuseConvAsY( - name_scope_, - FuseConvAsX(name_scope_, - FuseProjectionConv(name_scope_, std::make_pair(graph, 0)))); + auto graph_with_stats = + FuseProjectionConv(name_scope_, std::make_pair(graph, 0)); + graph_with_stats = FuseConvAsX(name_scope_, graph_with_stats); + graph_with_stats = FuseConvAsY(name_scope_, graph_with_stats); - LOG(INFO) << "Fused graph " << fused_graph_with_stats.second << "\n"; - AddStatis(fused_graph_with_stats.second); + AddStatis(graph_with_stats.second); } } // namespace ir } // namespace framework diff --git a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h index c83335da2f629c128fcf4819b2645ab1ef5eae42..c4351b382187d9062a059d013ddb237520645b6d 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h @@ -28,19 +28,9 @@ namespace paddle { namespace framework { namespace ir { -class Graph; -class GraphPatternDetector; -class Node; -namespace patterns { -struct Conv; -} // namespace patterns - -using graph_ptr = ir::Graph*; using GraphWithStats = std::pair; -void CorrectGraphEdges(Graph* graph, Node* from, Node* to); bool IsReachable(ir::Graph* graph, Node* from, Node* to); -paddle::optional HasBias(const Node& op, const std::string& bias_name); class ResidualConnectionMKLDNNFusePass : public FusePassBase { private: @@ -52,91 +42,13 @@ class ResidualConnectionMKLDNNFusePass : public FusePassBase { const std::string& name_scope, const GraphWithStats& graph_with_stats) const; - template - using GetNodeFunc = - std::function; - using IdentityConvFunc = GetNodeFunc>; - using IdentityElementwiseAddFunc = - GetNodeFunc>; - - using ProjectionConvFunc = IdentityConvFunc; - using ProjectionElementwiseAddFunc = GetNodeFunc>; - - using CanFuseFunc = std::function; - - std::tuple GetNodesFromConv( - const patterns::Conv& conv_pattern, - const GraphPatternDetector::subgraph_t& subgraph) const; - - std::tuple GetNodesFromProjectionConv( - const patterns::Conv& conv_pattern, - const GraphPatternDetector::subgraph_t& subgraph) const; - - template - GraphWithStats ExecuteHandleOnGraph(GraphPatternDetector* gpd, - const GraphWithStats& graph_with_stats, - OpFuncs&&... 
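(Editor's note, not part of the patch.) The rewritten ApplyImpl above threads a std::pair<ir::Graph*, int> through the three fuse steps, so each step mutates the graph and adds its own match count to the running total. A minimal standalone sketch of that accumulation pattern, with toy Graph and step functions instead of the real pass classes:

#include <cstdio>
#include <utility>

struct Graph {};  // stand-in for ir::Graph
using GraphWithStats = std::pair<Graph*, int>;

// Each "pass step" would rewrite the graph in place and report how many
// patterns it fused; the running total is carried in .second.
GraphWithStats FuseStepA(const GraphWithStats& in) {
  int fused_here = 2;  // pretend two patterns matched
  return std::make_pair(in.first, in.second + fused_here);
}

GraphWithStats FuseStepB(const GraphWithStats& in) {
  int fused_here = 1;
  return std::make_pair(in.first, in.second + fused_here);
}

int main() {
  Graph g;
  GraphWithStats stats = FuseStepA(std::make_pair(&g, 0));
  stats = FuseStepB(stats);  // same chaining style as the rewritten ApplyImpl
  std::printf("total fused: %d\n", stats.second);  // prints 3
  return 0;
}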
op_funcs) const { - ir::Graph* graph; - int stats; - - std::tie(graph, stats) = graph_with_stats; - - auto can_fuse = [this](Node* op1, Node* op2) -> bool { - return this->FindFuseOption(*op1, *op2) == FUSE_MKLDNN; - }; - auto fuse_handle = HandleType{can_fuse, std::forward(op_funcs)...}; - - (*gpd)(graph, fuse_handle); - - return std::make_pair(graph, stats + fuse_handle.get_stats()); - } - - struct IdentityFuseHandle { - IdentityFuseHandle( - const CanFuseFunc& can_fuse_func, - const IdentityConvFunc& get_node_from_conv_op, - const IdentityElementwiseAddFunc& get_node_from_elementwise_add_op, - const ResidualConnectionMKLDNNFusePass* pass); - - void operator()(const GraphPatternDetector::subgraph_t& subgraph, - Graph* graph); - int get_stats() const { return *fusion_stats; } - - private: - std::shared_ptr fusion_stats; - CanFuseFunc can_fuse_func; - IdentityConvFunc get_node_from_conv_op; - IdentityElementwiseAddFunc get_node_from_elementwise_add_op; - const ResidualConnectionMKLDNNFusePass* pass_; - }; - - struct ProjectionFuseHandle { - ProjectionFuseHandle( - const CanFuseFunc& can_fuse_func, - const ProjectionConvFunc& get_node_from_conv_x_op, - const ProjectionConvFunc& get_node_from_conv_y_op, - const ProjectionElementwiseAddFunc& get_node_from_elementwise_add_op, - const ResidualConnectionMKLDNNFusePass* pass); - - void operator()(const GraphPatternDetector::subgraph_t& subgraph, - Graph* graph); - int get_stats() const { return *fusion_stats; } - - private: - std::shared_ptr fusion_stats; - CanFuseFunc can_fuse_func; - ProjectionConvFunc get_node_from_conv_x_op; - ProjectionConvFunc get_node_from_conv_y_op; - ProjectionElementwiseAddFunc get_node_from_elementwise_add_op; - const ResidualConnectionMKLDNNFusePass* pass_; - }; - public: ResidualConnectionMKLDNNFusePass(); virtual ~ResidualConnectionMKLDNNFusePass() {} protected: - void ApplyImpl(graph_ptr graph) const; + void ApplyImpl(ir::Graph* graph) const; + static bool HasFusedActivation(Node* conv_node) { return !(conv_node->Op() ->GetAttrIfExists("fuse_activation") diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc index 96aa95bde337436dd6eb584b3eea5395b5301a34..11190309814e7c75777a6cddd7e4d24bfc7ba9e6 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc @@ -12,12 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include #include +#include -#include #include -#include -#include + +#include "gtest/gtest.h" #include "paddle/fluid/framework/ir/graph_traits.h" #include "paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h" #include "paddle/fluid/framework/ir/pass_tester_helper.h" @@ -25,7 +26,7 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/place.h" -USE_OP(batch_norm); +USE_OP_ITSELF(batch_norm); USE_OP_DEVICE_KERNEL(batch_norm, MKLDNN); USE_OP(conv2d_transpose); USE_OP_DEVICE_KERNEL(conv2d_transpose, MKLDNN); diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc index 0a95444f852dd0abdd150d04dc7536e26151c218..d578ada0db00fed85f7b4f25f1483169c72c2c0b 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc @@ -15,8 +15,9 @@ #include "paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass.h" #include -#include #include + +#include #include "paddle/fluid/framework/ir/pass_tester_helper.h" #include "paddle/fluid/framework/op_registry.h" @@ -27,7 +28,7 @@ USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN); USE_OP(leaky_relu); USE_OP_DEVICE_KERNEL(leaky_relu, MKLDNN); USE_OP(gelu); -USE_OP(relu); +USE_OP_ITSELF(relu); USE_OP(tanh); USE_OP_DEVICE_KERNEL(tanh, MKLDNN); diff --git a/paddle/fluid/framework/new_executor/standalone_executor_test.cc b/paddle/fluid/framework/new_executor/standalone_executor_test.cc index 2c3359ffa8e46f0d30a01d73fccb95d88771480a..219aae71127ed8963b4bfe4e8ee5e7259dbf7d02 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor_test.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor_test.cc @@ -37,7 +37,7 @@ USE_OP(elementwise_mul); USE_OP(softmax_with_cross_entropy); USE_OP_ITSELF(reduce_mean); USE_OP_ITSELF(reduce_sum); -USE_OP(reduce_sum_grad); +USE_OP_ITSELF(reduce_sum_grad); USE_OP(reduce_mean_grad); USE_OP_ITSELF(reshape2_grad); USE_OP(softmax_with_cross_entropy_grad); @@ -46,7 +46,7 @@ USE_OP(matmul_grad); USE_OP(square); USE_OP(transpose2_grad); USE_OP(concat_grad); -USE_OP(elementwise_mul_grad); +USE_OP_ITSELF(elementwise_mul_grad); USE_OP(sigmoid_grad); USE_OP(tanh_grad); USE_OP(sum); @@ -54,7 +54,7 @@ USE_OP(slice_grad); USE_OP(lookup_table_grad); USE_OP(sqrt); USE_OP(elementwise_max); -USE_OP(elementwise_div); +USE_OP_ITSELF(elementwise_div); USE_OP(sgd); USE_OP(squared_l2_norm); USE_OP(memcpy_h2d); diff --git a/paddle/fluid/framework/new_executor/workqueue/nonblocking_threadpool.h b/paddle/fluid/framework/new_executor/workqueue/nonblocking_threadpool.h index 7b3916bafc93eda8cb1afbf54b706e032c5233dd..bc65231abe7371a931f709c9190b55fde24f0543 100644 --- a/paddle/fluid/framework/new_executor/workqueue/nonblocking_threadpool.h +++ b/paddle/fluid/framework/new_executor/workqueue/nonblocking_threadpool.h @@ -409,7 +409,7 @@ class ThreadPoolTempl { return false; } platform::RecordEvent("SleepWaitForWork", - platform::TracerEventType::UserDefined, 2); + platform::TracerEventType::UserDefined, 10); ec_.CommitWait(waiter); blocked_--; return true; diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h index c45bf32d8b710cb35ec5f86a4a8ba2e1078537e6..eb40a49b4066a7a8c8e9c142a310b815fd73da20 100644 --- a/paddle/fluid/framework/op_registry.h +++ b/paddle/fluid/framework/op_registry.h @@ -286,8 +286,8 @@ struct OpKernelRegistrarFunctorEx, \ paddle::framework::EmptyGradOpMaker) diff --git 
a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index d33791f70c4d2f759bcd4f6443a5a1f244673d4f..f8e30c1ee294ecf692e2992b6123232ba1c8bd7d 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -254,7 +254,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { "reinstall Paddle with CustomDevice support.", place)); #else - platform::DeviceManager::SetDevice(place); + phi::DeviceManager::SetDevice(place); #endif } @@ -264,10 +264,10 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { // and different op name cost time,we set two event. platform::RecordEvent op_type_record_event( Type(), platform::TracerEventType::Operator, 1); - // auto op_name = platform::OpName(outputs_, Type()); - // platform::RecordEvent op_name_record_event( - // op_name, platform::TracerEventType::Operator, 1, - // platform::EventRole::kUniqueOp); + auto op_name = platform::OpName(outputs_, Type()); + platform::RecordEvent op_name_record_event( + op_name, platform::TracerEventType::Operator, 10, + platform::EventRole::kUniqueOp); RunImpl(scope, place); } @@ -539,6 +539,20 @@ bool ExecutionContext::HasInput(const std::string& name) const { return var != nullptr; } +bool ExecutionContext::HasInputs(const std::string& name) const { + const auto& ins = ctx_.inputs; + auto it = ins.find(name); + if (it == ins.end() || it->second.empty()) { + return false; + } + for (const auto* input : it->second) { + if (input == nullptr) { + return false; + } + } + return true; +} + bool ExecutionContext::HasOutput(const std::string& name) const { auto* var = OutputVar(name); return var != nullptr; @@ -1210,6 +1224,9 @@ void OperatorWithKernel::RunImpl(const Scope& scope, VLOG(6) << "Static mode ChoosePhiKernel - kernel `" << pt_kernel_name << "` not found."; } + } else { + pt_kernel_name = pt_kernel_signature_->name; + pt_kernel_key = TransOpKernelTypeToPhiKernelKey(*kernel_type_.get()); } #ifdef PADDLE_WITH_XPU bool is_xpu_unsupport = @@ -2048,7 +2065,11 @@ void OperatorWithKernel::BuildPhiKernelContext( // deal with optional here if ((it == ctx.inputs.end() || it->second.size() == 0) && (input_defs[i].type_index == - std::type_index(typeid(paddle::optional)))) { + std::type_index( + typeid(paddle::optional)) || + input_defs[i].type_index == + std::type_index( + typeid(paddle::optional)))) { pt_kernel_context->EmplaceBackInputWithoutSetRange(nullptr); auto end_idx = start_idx + 1; pt_kernel_context->AssignInputRange(std::make_pair(start_idx, end_idx), @@ -2074,6 +2095,7 @@ void OperatorWithKernel::BuildPhiKernelContext( } pt_kernel_context->AssignInputRange(std::make_pair(start_idx, end_idx), i); } + VLOG(4) << "Done inputs"; for (size_t i = 0; i < output_names.size(); ++i) { auto it = ctx.outputs.find(output_names[i]); @@ -2098,26 +2120,25 @@ void OperatorWithKernel::BuildPhiKernelContext( for (size_t offset = 0; offset < outs_vector.size(); ++offset) { phi::TensorBase* tensor_out = nullptr; auto* var = outs_vector[offset]; - if (var->template IsType()) { - tensor_out = var->template GetMutable(); - } else if (var->template IsType()) { - tensor_out = var->template GetMutable(); - } else { - PADDLE_THROW(platform::errors::Unimplemented( - "Unsupported output `%s` type when call pt kernel.", - framework::ToTypeName(var->Type()))); - } - experimental::ResetTensorDtypeAndLayoutByArgDef(tensor_out, - output_defs.at(i)); - SetAllocationForOutputTenosr( - tensor_out, phi::TransToPhiPlace(output_defs.at(i).backend)); + if (var) { + 
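(Editor's note, not part of the patch.) The ExecutionContext::HasInputs added above is stricter than HasInput: the whole input slot must exist, be non-empty, and contain no null variables. A standalone sketch of that check over a generic name-to-pointer-list map (illustrative Variable/InputMap types, not the real RuntimeContext):

#include <cassert>
#include <map>
#include <string>
#include <vector>

struct Variable {};  // stand-in for framework::Variable

using InputMap = std::map<std::string, std::vector<const Variable*>>;

// True only if the slot exists, is non-empty, and has no null entries.
bool HasInputs(const InputMap& inputs, const std::string& name) {
  auto it = inputs.find(name);
  if (it == inputs.end() || it->second.empty()) return false;
  for (const Variable* var : it->second) {
    if (var == nullptr) return false;
  }
  return true;
}

int main() {
  Variable x;
  InputMap ins;
  ins["X"] = {&x};
  ins["Y"] = {&x, nullptr};  // slot exists but one element is missing
  assert(HasInputs(ins, "X"));
  assert(!HasInputs(ins, "Y"));
  assert(!HasInputs(ins, "Z"));
  return 0;
}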
if (var->template IsType()) { + tensor_out = var->template GetMutable(); + } else if (var->template IsType()) { + tensor_out = var->template GetMutable(); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported output `%s` type when call pt kernel.", + framework::ToTypeName(var->Type()))); + } + } pt_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out); } pt_kernel_context->AssignOutputRange(std::make_pair(start_idx, end_idx), i); } + VLOG(4) << "Done outputs"; for (size_t i = 0; i < attr_names.size(); ++i) { if (attr_defs[i].type_index == std::type_index(typeid(phi::ScalarArray))) { @@ -2182,6 +2203,51 @@ void OperatorWithKernel::BuildPhiKernelContext( std::move(experimental::MakePhiScalarFromVar(*ins_vector.front()))); } + } else if (attr_defs[i].type_index == + std::type_index(typeid(std::vector))) { + auto& attr = Attrs().at(attr_names[i]); + if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + pt_kernel_context->EmplaceBackAttr(std::move(scalar_list)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + pt_kernel_context->EmplaceBackAttr(std::move(scalar_list)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + pt_kernel_context->EmplaceBackAttr(std::move(scalar_list)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + pt_kernel_context->EmplaceBackAttr(std::move(scalar_list)); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported cast op attribute `%s` to vector when " + "construct KernelContext.", + attr_names[i])); + } } else { // TODO(chenweihang): support other attrs later auto& attr = Attrs().at(attr_names[i]); @@ -2205,15 +2271,17 @@ void OperatorWithKernel::BuildPhiKernelContext( } else if (attr_defs[i].type_index == std::type_index(typeid(std::vector))) { if (std::type_index(attr.type()) == - std::type_index(typeid(std::vector))) { + std::type_index(typeid(std::vector))) { + pt_kernel_context->EmplaceBackAttr( + BOOST_GET_CONST(std::vector, attr)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { // Emplace Back Attr according to the type of Phi_Kernel args. 
const auto& vector_int_attr = BOOST_GET_CONST(std::vector, attr); const std::vector vector_int64_attr(vector_int_attr.begin(), vector_int_attr.end()); pt_kernel_context->EmplaceBackAttr(vector_int64_attr); } - // TODO(YuanRisheng) Need support vector attr - } else if (attr_defs[i].type_index == std::type_index(typeid(std::vector))) { const auto& vector_int_attr = BOOST_GET_CONST(std::vector, attr); @@ -2226,6 +2294,7 @@ void OperatorWithKernel::BuildPhiKernelContext( } } } + VLOG(4) << "Done attributes"; } } // namespace framework diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 16718a316513e3574e9a7eb14ed50106c8b0dcb6..1a1171f1dba4d794796ef1421fe386f60a0e587d 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -295,6 +295,8 @@ class ExecutionContext { virtual bool HasInput(const std::string& name) const; + virtual bool HasInputs(const std::string& name) const; + virtual bool HasOutput(const std::string& name) const; virtual size_t InputSize(const std::string& name) const { @@ -449,7 +451,7 @@ class ExecutionArgumentMappingContext : public phi::ArgumentMappingContext { : ctx_(ctx) {} bool HasInput(const std::string& name) const override { - return ctx_.HasInput(name); + return ctx_.HasInputs(name); } bool HasOutput(const std::string& name) const override { @@ -489,6 +491,8 @@ class ExecutionArgumentMappingContext : public phi::ArgumentMappingContext { return ctx_.OutputVar(name)->IsType(); } + bool IsForInferShape() const override { return false; } + private: const ExecutionContext& ctx_; }; diff --git a/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc b/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc index bf9d1baaf394f05d125563311dd2047383373834..47dffd47b7cbbf4a37e6715b40d41024330bc679 100644 --- a/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc +++ b/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc @@ -675,7 +675,7 @@ TEST(BuildCinnPassTest, NoNeedBufferInput) { USE_PASS(build_cinn_pass); USE_OP(mul); -USE_OP(relu); +USE_OP_ITSELF(relu); USE_OP_ITSELF(elementwise_add); -USE_OP(relu_grad); +USE_OP_ITSELF(relu_grad); USE_OP_ITSELF(elementwise_add_grad); diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc index 706815185a1b5b53d1bb8e26274206abc126cfd5..c015e90f71e54691e92c3a36c3d6e053372f64f3 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc @@ -241,7 +241,6 @@ std::unique_ptr CinnCompiler::CompileGraph( std::make_unique(target, scope, cinn_graph); GraphCompiler::CompileOptions options; options.with_instantiate_variables = false; - options.with_buffer_handle_instruction_inserted = true; auto compiled_res = graph_compiler->Build(options, std::move(fetch_ids), stream); auto compiled_obj = std::make_unique(); diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc b/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc index e8badab27b9b97aade81bf496ce211485f924757..cdccc4c5546900a141a084281f419c2940b23817 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc @@ -301,5 +301,5 @@ TEST(CinnCompilerTest, Compile) { USE_PASS(build_cinn_pass); USE_PASS(graph_viz_pass); USE_OP(mul); -USE_OP(relu); +USE_OP_ITSELF(relu); USE_OP_ITSELF(elementwise_add); diff --git a/paddle/fluid/framework/phi_utils.cc b/paddle/fluid/framework/phi_utils.cc index 
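(Editor's note, not part of the patch.) The attribute branches added above turn a plain attribute vector (int, int64, float, or double) into a list of scalar wrappers before it reaches the phi kernel context, always with a reserve followed by element-wise emplace_back. A small standalone illustration of that widening step, using std::variant as a stand-in for phi::Scalar:

#include <cassert>
#include <cstdint>
#include <variant>
#include <vector>

// Stand-in for phi::Scalar: holds one of the supported attribute types.
using Scalar = std::variant<int, std::int64_t, float, double>;

template <typename T>
std::vector<Scalar> ToScalarList(const std::vector<T>& values) {
  std::vector<Scalar> scalars;
  scalars.reserve(values.size());  // same reserve-then-emplace pattern
  for (const T& value : values) {
    scalars.emplace_back(value);   // each element becomes one Scalar
  }
  return scalars;
}

int main() {
  std::vector<int> shape = {2, 3, 4};
  std::vector<Scalar> as_scalars = ToScalarList(shape);
  assert(as_scalars.size() == 3);
  assert(std::get<int>(as_scalars[1]) == 3);
  return 0;
}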
355291beb60f949b52b681592d42b7da4e80186b..14997dd9610138e32a45ef17abc9276cd1dad172 100644 --- a/paddle/fluid/framework/phi_utils.cc +++ b/paddle/fluid/framework/phi_utils.cc @@ -68,6 +68,8 @@ OpKernelType TransPhiKernelKeyToOpKernelType(const phi::KernelKey& kernel_key) { library_type = LibraryType::kMKLDNN; } else if (kernel_key.backend() == phi::Backend::GPUDNN) { library_type = LibraryType::kCUDNN; + } else if (kernel_key.backend() == phi::Backend::KPS) { + library_type = LibraryType::kKP; } else { // do nothing } @@ -82,6 +84,8 @@ phi::KernelKey TransOpKernelTypeToPhiKernelKey( backend = phi::Backend::MKLDNN; } else if (kernel_type.library_type_ == LibraryType::kCUDNN) { backend = phi::Backend::GPUDNN; + } else if (kernel_type.library_type_ == LibraryType::kKP) { + backend = phi::Backend::KPS; } else { // do } @@ -121,6 +125,15 @@ phi::KernelKey FallBackToCpu(const OpKernelType& expected_kernel_key, return phi::KernelKey(phi::Backend::CPU, kernel_key.layout(), kernel_key.dtype()); } +#endif +#ifdef PADDLE_WITH_IPU + if (platform::is_ipu_place(expected_kernel_key.place_)) { + VLOG(3) << "pten missing IPU kernel: " << op.Type() + << ", expected_kernel_key:" << expected_kernel_key + << ", fallbacking to CPU one!"; + return phi::KernelKey(phi::Backend::CPU, kernel_key.layout(), + kernel_key.dtype()); + } #endif return phi::KernelKey(); } @@ -229,26 +242,5 @@ static void SetAllocationForUninitializedDenseTensor( dense_tensor->ResetHolder(shared_allocation); } -void SetAllocationForOutputTenosr(phi::TensorBase* tensor, - const platform::Place& place) { - if (phi::DenseTensor::classof(tensor)) { - auto* dense_tensor = static_cast(tensor); - if (!dense_tensor->IsInitialized() || !(dense_tensor->place() == place)) { - SetAllocationForUninitializedDenseTensor(dense_tensor, place); - } - } else if (phi::SelectedRows::classof(tensor)) { - auto* selected_rows = static_cast(tensor); - if (!selected_rows->value().IsInitialized() || - !(selected_rows->place() == place)) { - SetAllocationForUninitializedDenseTensor(selected_rows->mutable_value(), - place); - } - } else { - PADDLE_THROW(platform::errors::Unimplemented( - "Unsupported tensor type is received when setting allocation for " - "output tensor.")); - } -} - } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/phi_utils.h b/paddle/fluid/framework/phi_utils.h index 1a1f79d82770058ae4010b7a3a3162280ceb1537..a17578816921b2337a76d1a0a69a6c8adbc51c4d 100644 --- a/paddle/fluid/framework/phi_utils.h +++ b/paddle/fluid/framework/phi_utils.h @@ -62,9 +62,6 @@ class KernelArgsNameMaker { void InitDefaultKernelSignatureMap(); -void SetAllocationForOutputTenosr(phi::TensorBase* tensor, - const platform::Place& place); - // TODO(Wilber): support others device context. template struct ConvertToPhiContext { diff --git a/paddle/fluid/framework/ps.proto b/paddle/fluid/framework/ps.proto new file mode 100755 index 0000000000000000000000000000000000000000..0ae87812bce434be5e664aefea4bba19ae147d28 --- /dev/null +++ b/paddle/fluid/framework/ps.proto @@ -0,0 +1,213 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
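(Editor's note, not part of the patch.) The FallBackToCpu change above extends the existing pattern: when the expected place has no phi kernel (here IPU, like the earlier XPU branch), the kernel key is rewritten to the CPU backend while keeping its layout and dtype. A toy sketch of that rewriting rule with illustrative enums rather than the real phi::KernelKey:

#include <cassert>

enum class Backend { kCPU, kGPU, kIPU };
enum class Layout { kNCHW };
enum class DataType { kFP32 };

struct KernelKey {
  Backend backend;
  Layout layout;
  DataType dtype;
};

// If the requested backend has no kernel, fall back to CPU but preserve the
// layout and dtype of the original key.
KernelKey FallBackToCpu(const KernelKey& requested, bool has_kernel) {
  if (has_kernel) return requested;
  return KernelKey{Backend::kCPU, requested.layout, requested.dtype};
}

int main() {
  KernelKey ipu_key{Backend::kIPU, Layout::kNCHW, DataType::kFP32};
  KernelKey chosen = FallBackToCpu(ipu_key, /*has_kernel=*/false);
  assert(chosen.backend == Backend::kCPU);
  assert(chosen.dtype == DataType::kFP32);  // dtype preserved
  return 0;
}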
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto2"; +package paddle.distributed; +option cc_generic_services = true; +option cc_enable_arenas = true; + +message FsClientParameter { + enum FsApiType { + HDFS = 0; + AFS = 1; + } + optional FsApiType fs_type = 1 [ default = HDFS ]; + optional string uri = 2; // such as afs://xxx.afs.com:9902 + optional string user = 3; // user_name to access fs + optional string passwd = 4; // password + optional int32 buffer_size = 5; // buffer for read/write + optional string hadoop_bin = 51; + optional string afs_conf = 101; +} + +message PSParameter { + optional string worker_class = 1; + optional string server_class = 2; + optional string instance_class = 3; + optional string init_gflags = 4 [ default = "" ]; + optional WorkerParameter worker_param = 101; + optional ServerParameter server_param = 102; + repeated DownpourTrainerParameter trainer_param = 301; + optional FsClientParameter fs_client_param = 501; +} + +message WorkerParameter { + optional DownpourWorkerParameter downpour_worker_param = 1; +} + +message DownpourWorkerParameter { + repeated TableParameter downpour_table_param = 1; +} + +message DownpourServerParameter { + repeated TableParameter downpour_table_param = 1; + optional ServerServiceParameter service_param = 2; +} + +message ServerParameter { + optional DownpourServerParameter downpour_server_param = 1; +} + +message DownpourTrainerParameter { + repeated DenseTableParameter dense_table = 1; + repeated SparseTableParameter sparse_table = 2; + optional int32 push_sparse_per_batch = 3; + optional int32 push_dense_per_batch = 4; + repeated string skip_op = 5; + repeated ProgramConfig program_config = 6; +} + +message DenseTableParameter { + optional int32 table_id = 1; + repeated string dense_variable_name = 2; + repeated string dense_gradient_variable_name = 3; + optional int32 fea_dim = 4; +} + +message SparseTableParameter { + optional int32 table_id = 1; + optional int32 feature_dim = 2; + repeated string slot_key = 3; + repeated string slot_value = 4; + repeated string slot_gradient = 5; +} + +message ServerServiceParameter { + optional string server_class = 1 [ default = "BrpcPsServer" ]; + optional string client_class = 2 [ default = "BrpcPsClient" ]; + optional string service_class = 3 [ default = "BrpcPsService" ]; + optional uint32 start_server_port = 4 + [ default = 0 ]; // will find a avaliable port from it + optional uint32 server_thread_num = 5 [ default = 12 ]; +} + +message ProgramConfig { + required string program_id = 1; + repeated int32 push_sparse_table_id = 2; + repeated int32 push_dense_table_id = 3; + repeated int32 pull_sparse_table_id = 4; + repeated int32 pull_dense_table_id = 5; +} + +enum TableType { + PS_SPARSE_TABLE = 0; + PS_DENSE_TABLE = 1; + PS_OTHER_TABLE = 2; +} + +message TableParameter { + optional uint64 table_id = 1; + optional string table_class = 2; + optional uint64 shard_num = 3 [ default = 1000 ]; + optional TableAccessorParameter accessor = 4; + optional TensorAccessorParameter tensor = 5; + optional CommonAccessorParameter common = 6; + optional TableType type = 7; + optional bool compress_in_save = 8 [ 
default = false ]; +} + +message TableAccessorParameter { + optional string accessor_class = 1; + optional uint32 fea_dim = 4 [ default = 11 ]; + optional uint32 embedx_dim = 5 [ default = 8 ]; + optional uint32 embedx_threshold = 6 [ default = 10 ]; + optional CtrAccessorParameter ctr_accessor_param = 7; + repeated TableAccessorSaveParameter table_accessor_save_param = 8; + optional SparseCommonSGDRuleParameter embed_sgd_param = 10; + optional SparseCommonSGDRuleParameter embedx_sgd_param = 11; +} + +message CtrAccessorParameter { + optional float nonclk_coeff = 1 + [ default = 0.1 ]; // to calculate show_click_score + optional float click_coeff = 2 + [ default = 1 ]; // to calculate show_click_score + optional float base_threshold = 3 [ + default = 1.5 + ]; // show_click_score > base_threshold, this feature can be saved + optional float delta_threshold = 4 + [ default = + 0.25 ]; // delta_score > delta_threshold, this feature can be saved + optional float delta_keep_days = 5 + [ default = + 16 ]; // unseen_day < delta_keep_days, this feature can be saved + optional float show_click_decay_rate = 6 [ + default = 0.98 + ]; // show/click will update to show/click * show_click_decay_rate after a day + optional float delete_threshold = 7 + [ default = 0.8 ]; // threshold to shrink a feasign + optional float delete_after_unseen_days = 8 + [ default = 30 ]; // unseen_day > delete_after_unseen_days, this feature + // will be delete in shrink_model + optional int32 ssd_unseenday_threshold = 9 + [ default = 1 ]; // threshold to save ssd +} + +message TensorAccessorParameter { + optional string feed_var_name = 1; + optional string fetch_var_name = 2; + optional int64 startup_program_id = 3; + optional int64 main_program_id = 4; + optional string tensor_table_class = 6; +} + +message CommonAccessorParameter { + optional string name = 1; + optional string table_name = 2; + repeated string attributes = 3; + repeated string params = 4; + repeated uint32 dims = 5; + repeated string initializers = 6; + optional string entry = 7; + optional int32 trainer_num = 8; + optional bool sync = 9; + optional uint32 table_num = 10; + optional uint32 table_dim = 11; +} + +message TableAccessorSaveParameter { + optional uint32 param = 1; + optional string converter = 2; + optional string deconverter = 3; +} + +message SparseCommonSGDRuleParameter { + optional string name = 1; + optional SparseNaiveSGDRuleParameter naive = 2; + optional SparseAdagradSGDRuleParameter adagrad = 3; + optional SparseAdamSGDParameter adam = 4; +} + +message SparseNaiveSGDRuleParameter { // SparseNaiveSGDRule + optional double learning_rate = 1 [ default = 0.05 ]; + optional double initial_range = 2 [ default = 0.0001 ]; + repeated float weight_bounds = 3; +} + +message + SparseAdagradSGDRuleParameter { // SparseAdaGradSGDRule|StdAdaGradSGDRule + optional double learning_rate = 1 [ default = 0.05 ]; + optional double initial_g2sum = 2 [ default = 3.0 ]; + optional double initial_range = 3 [ default = 0.0001 ]; + repeated float weight_bounds = 4; +} + +message SparseAdamSGDParameter { // SparseAdamSGDRule + optional double learning_rate = 1 [ default = 0.001 ]; + optional double initial_range = 2 [ default = 0.0001 ]; + optional double beta1_decay_rate = 3 [ default = 0.9 ]; + optional double beta2_decay_rate = 4 [ default = 0.999 ]; + optional double ada_epsilon = 5 [ default = 1e-08 ]; + repeated float weight_bounds = 6; +} diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt index 
f198919b0c87bb4f2ea9991e401a8242676d3f46..3d8a5ab21f00fcc4137d177b741023a827e325d7 100644 --- a/paddle/fluid/imperative/CMakeLists.txt +++ b/paddle/fluid/imperative/CMakeLists.txt @@ -33,6 +33,7 @@ if(NOT WIN32) endif() if(WITH_CNCL) cc_library(cncl_context SRCS cncl_context.cc DEPS collective_helper device_context tensor var_type_traits) + cc_library(reducer SRCS reducer.cc DEPS layer) endif() if(WITH_NCCL OR WITH_RCCL OR WITH_XPU_BKCL OR WITH_ASCEND_CL) cc_library(heter_ccl_context SRCS heter_ccl_context.cc DEPS collective_helper device_context tensor var_type_traits) @@ -41,13 +42,17 @@ if(NOT WIN32) endif(NOT WIN32) if(WITH_GLOO) cc_library(imperative_gloo_context SRCS gloo_context.cc DEPS collective_helper device_context tensor var_type_traits) - if ( WIN32 OR (NOT (WITH_NCCL OR WITH_RCCL OR WITH_XPU_BKCL OR WITH_ASCEND_CL) )) + if ( WIN32 OR (NOT (WITH_NCCL OR WITH_RCCL OR WITH_XPU_BKCL OR WITH_ASCEND_CL OR WITH_CNCL) )) cc_library(reducer SRCS reducer.cc DEPS layer) endif() endif() +if(WITH_MLU) + SET(MLU_DEPS mlu_baseop) +endif() + if(NOT WITH_ASCEND_CL) -cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows_utils selected_rows_functor var_type_traits layer math_function phi_tensor) +cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows_utils selected_rows_functor var_type_traits layer math_function phi_tensor ${MLU_DEPS}) else() cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows_utils selected_rows_functor var_type_traits layer math_function npu_op_runner phi_tensor) endif() diff --git a/paddle/fluid/imperative/basic_engine.cc b/paddle/fluid/imperative/basic_engine.cc index 8373c7fe50d0222d6b38a400e82239dc8c3590ad..7416d206fc43eaf5a56c3eb606bb0672d1172c0b 100644 --- a/paddle/fluid/imperative/basic_engine.cc +++ b/paddle/fluid/imperative/basic_engine.cc @@ -317,6 +317,7 @@ static std::shared_ptr> CallGradientHooks( auto tmp_var = var; for (const auto& hook_pair : var->GetVariableWrapperHooks()) { tmp_var = (*hook_pair.second)(tmp_var); + CheckVar(var, tmp_var); } (*tmp_ins_ptr)[pair.first][i] = tmp_var; } diff --git a/paddle/fluid/imperative/execution_context.h b/paddle/fluid/imperative/execution_context.h index fe5ac73b0046915c4a52087ed792925b0b0ed200..fbc47f81fd33169f54aeb2c251f9b6c90cb44637 100644 --- a/paddle/fluid/imperative/execution_context.h +++ b/paddle/fluid/imperative/execution_context.h @@ -133,6 +133,11 @@ class DygraphExecutionContext : public framework::ExecutionContext { return (it != var_map_in_.end() && it->second.size() > 0); } + bool HasInputs(const std::string& name) const override { + auto it = var_map_in_.find(name); + return (it != var_map_in_.end() && it->second.size() > 0); + } + bool HasOutput(const std::string& name) const override { auto it = var_map_out_.find(name); return (it != var_map_out_.end() && it->second.size() > 0); diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index 0abc5ad90e2697eb78ff1e21ceb2bc0e97e14a44..12aa13bbacc3bae5d690323f45817f95762c376c 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -732,6 +732,7 @@ void GradientAccumulator::CallGradientHooks() { << var_->GetVariableWrapperHooks().size(); for (const auto& hook_pair : var_->GetVariableWrapperHooks()) { tmp_var = (*hook_pair.second)(tmp_var); + CheckVar(inner_var_, tmp_var); } inner_var_ = 
tmp_var; } diff --git a/paddle/fluid/imperative/gradient_accumulator.h b/paddle/fluid/imperative/gradient_accumulator.h index e74711c2a796576d55e06cdfb59efa074324a71f..03f6775defc2f8fccba0654ae5d366d66ad88fc0 100644 --- a/paddle/fluid/imperative/gradient_accumulator.h +++ b/paddle/fluid/imperative/gradient_accumulator.h @@ -179,5 +179,29 @@ void SelectedRowsAddTensor(const VarType& src_selected_rows_var, template void TensorAdd(const VarType& src, VarType* dst); +inline void CheckVar(const std::shared_ptr& pre, + const std::shared_ptr& post) { + if (pre->IsEmpty() && !post->IsEmpty()) { + PADDLE_THROW(platform::errors::PermissionDenied( + "The tensor(%s) in before and after hook are not consistent", + pre->Name())); + } + if (!pre->IsEmpty() && !post->IsEmpty()) { + VLOG(4) << pre->DataType() << " " << post->DataType(); + PADDLE_ENFORCE_EQ( + pre->DataType(), post->DataType(), + platform::errors::PermissionDenied( + "The dtype of tensor(%s) before(%s) and after(%s) hook are not " + "consistent", + pre->Name(), framework::DataTypeToString(pre->DataType()), + framework::DataTypeToString(post->DataType()))); + PADDLE_ENFORCE_EQ(pre->Place(), post->Place(), + platform::errors::PermissionDenied( + "The place of tensor(%s) before(%s) and after(%s) " + "hook are not consistent", + pre->Name(), pre->Place(), post->Place())); + } +} + } // namespace imperative } // namespace paddle diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index 9dd1dacc02c25474803ef3177d9cd967ee681714..bae49fb381a475dd8227d1dc855a6db28c9cd273 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -186,11 +186,10 @@ PreparedOp PrepareImpl(const NameVarMap& ins, << " | kernel key: " << pt_kernel_key << " | kernel: " << pt_kernel; - if (platform::is_cpu_place(expected_kernel_key.place_)) { - auto* cpu_ctx = pool.Get(paddle::platform::CPUPlace()); - return PreparedOp(op, ctx, expected_kernel_key, pt_kernel_signature, - pt_kernel, cpu_ctx); + if (expected_kernel_key.place_ != place) { + dev_ctx = pool.Get(expected_kernel_key.place_); } + // TODO(chenweihang): using CPUKernel when miss device kernel case return PreparedOp(op, ctx, expected_kernel_key, pt_kernel_signature, pt_kernel, dev_ctx); @@ -248,6 +247,7 @@ PreparedOp PrepareImpl(const NameVarMap& ins, #endif #ifdef PADDLE_WITH_XPU_KP + expected_kernel_key.place_ = platform::XPUPlace(); bool use_xpu_kp_kernel_rt = FLAGS_run_kp_kernel && paddle::platform::is_xpu_kp_support_op(op.Type(), expected_kernel_key); diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h index 8e1e2fbe9a12da672a633075ed4c41d3d62cd7e1..d7c0c8cc547e6b04f67ddbb06121d139756d5142 100644 --- a/paddle/fluid/imperative/prepared_operator.h +++ b/paddle/fluid/imperative/prepared_operator.h @@ -314,27 +314,25 @@ void BuildDygraphPhiKernelContext( phi::TensorBase* tensor_out = nullptr; auto* var = outs_vector[offset]->MutableVar(); - if (var->template IsType()) { - tensor_out = var->template GetMutable(); - } else if (var->template IsType()) { - tensor_out = var->template GetMutable(); - } else { - PADDLE_THROW(platform::errors::Unimplemented( - "Unsupported output `%s` type when call pt kernel.", - framework::ToTypeName(var->Type()))); + if (var) { + if (var->template IsType()) { + tensor_out = var->template GetMutable(); + } else if (var->template IsType()) { + tensor_out = var->template GetMutable(); + } else { + 
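(Editor's note, not part of the patch.) The CheckVar helper above enforces that a user gradient hook does not turn an empty gradient into a non-empty one and does not silently change the tensor's dtype or place. A standalone sketch of the same before/after validation with toy metadata and exceptions (CheckHookResult, VarMeta, and the enums are illustrative names):

#include <stdexcept>
#include <string>

enum class DataType { kFP32, kFP16 };
enum class Place { kCPU, kGPU };

struct VarMeta {
  std::string name;
  bool empty = true;
  DataType dtype = DataType::kFP32;
  Place place = Place::kCPU;
};

// Throws if the hook changed properties that the engine relies on.
void CheckHookResult(const VarMeta& before, const VarMeta& after) {
  if (before.empty && !after.empty) {
    throw std::runtime_error("hook made tensor " + before.name + " non-empty");
  }
  if (!before.empty && !after.empty) {
    if (before.dtype != after.dtype) {
      throw std::runtime_error("hook changed dtype of " + before.name);
    }
    if (before.place != after.place) {
      throw std::runtime_error("hook changed place of " + before.name);
    }
  }
}

int main() {
  VarMeta before{"x@GRAD", /*empty=*/false, DataType::kFP32, Place::kGPU};
  VarMeta after = before;
  CheckHookResult(before, after);  // ok: the hook changed nothing
  after.dtype = DataType::kFP16;
  try {
    CheckHookResult(before, after);  // throws: dtype changed by the hook
  } catch (const std::runtime_error&) {
    // expected
  }
  return 0;
}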
PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported output `%s` type when call pt kernel.", + framework::ToTypeName(var->Type()))); + } } - experimental::ResetTensorDtypeAndLayoutByArgDef(tensor_out, - output_defs.at(i)); - framework::SetAllocationForOutputTenosr( - tensor_out, phi::TransToPhiPlace(output_defs.at(i).backend)); - kernel_ctx->EmplaceBackOutputWithoutSetRange(tensor_out); } kernel_ctx->AssignOutputRange(std::make_pair(start_idx, end_idx), i); } for (size_t i = 0; i < attr_names.size(); ++i) { + VLOG(1) << "############## attr_name: " << i << " : " << attr_names[i]; if (attr_defs[i].type_index == std::type_index(typeid(phi::ScalarArray))) { if (attrs.find(attr_names[i]) != attrs.end()) { // shape is in the attribute @@ -412,6 +410,60 @@ void BuildDygraphPhiKernelContext( experimental::MakePhiScalarFromVar(ins_vector[0]->Var()))); } + } else if (attr_defs[i].type_index == + std::type_index(typeid(std::vector))) { + auto& attr = GetAttr(attrs, default_attrs, attr_names[i]); + if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + kernel_ctx->EmplaceBackAttr(std::move(scalar_list)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + kernel_ctx->EmplaceBackAttr(std::move(scalar_list)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + kernel_ctx->EmplaceBackAttr(std::move(scalar_list)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + kernel_ctx->EmplaceBackAttr(std::move(scalar_list)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + kernel_ctx->EmplaceBackAttr(std::move(scalar_list)); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported cast op attribute `%s` to vector when " + "construct KernelContext.", + attr_names[i])); + } } else { // TODO(chenweihang): support other attrs later auto& attr = GetAttr(attrs, default_attrs, attr_names[i]); @@ -435,7 +487,11 @@ void BuildDygraphPhiKernelContext( } else if (attr_defs[i].type_index == std::type_index(typeid(std::vector))) { if (std::type_index(attr.type()) == - std::type_index(typeid(std::vector))) { + std::type_index(typeid(std::vector))) { + kernel_ctx->EmplaceBackAttr( + BOOST_GET_CONST(std::vector, attr)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { // Emplace Back Attr according to the type of Phi_Kernel args. 
const auto& vector_int_attr = BOOST_GET_CONST(std::vector, attr); const std::vector vector_int64_attr(vector_int_attr.begin(), diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc index 3a6365b2af21ae9012fe37293699caed9bb23855..fec9afbf3b403ca2fd45633326c7f7dec46e1243 100644 --- a/paddle/fluid/imperative/reducer.cc +++ b/paddle/fluid/imperative/reducer.cc @@ -31,7 +31,7 @@ namespace imperative { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_GLOO) || \ - defined(PADDLE_WITH_ASCEND_CL) + defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_CNCL) // div the nranks void Group::DivNRanks(const platform::DeviceContext &context, int64_t nranks) { framework::Tensor *tensor = @@ -67,6 +67,9 @@ void Group::DivNRanks(const platform::DeviceContext &context, int64_t nranks) { #ifdef PADDLE_WITH_XPU_BKCL // TODO(liuyuhui) support xpu about div nranks in the future #endif + } else if (platform::is_mlu_place(tensor->place())) { + // TODO(zhangna) + VLOG(4) << "divnrank for mlu not support yet"; } } @@ -222,6 +225,56 @@ void SplitTensorsWithType( } #endif +#ifdef PADDLE_WITH_CNCL +// context is used to select the stream for concat +template <> +void ConcatTensorsWithType( + const platform::MLUDeviceContext &context, + const std::vector &dense_tensors_, + framework::Variable *p_dense_contents, + framework::proto::VarType::Type type) { + switch (type) { + case framework::proto::VarType::FP16: + ConcatTensorsForAllReduce( + context, dense_tensors_, p_dense_contents); + break; + case framework::proto::VarType::FP32: + ConcatTensorsForAllReduce( + context, dense_tensors_, p_dense_contents); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Data type (%s) is not supported when it concats tensors for " + "allreduce.", + framework::DataTypeToString(type))); + } +} + +// context is used to select the stream for split +template <> +void SplitTensorsWithType( + const platform::MLUDeviceContext &context, + framework::Variable *p_dense_contents, + std::vector *p_dense_tensors, + framework::proto::VarType::Type type) { + switch (type) { + case framework::proto::VarType::FP16: + SplitTensorsForAllReduce( + context, p_dense_contents, p_dense_tensors); + break; + case framework::proto::VarType::FP32: + SplitTensorsForAllReduce( + context, p_dense_contents, p_dense_tensors); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Data type (%s) is not supported when it splits tensors for " + "allreduce.", + framework::DataTypeToString(type))); + } +} +#endif + void Group::ConcatTensors(const platform::DeviceContext &context) { auto place = context.GetPlace(); if (platform::is_gpu_place(place)) { @@ -253,6 +306,16 @@ void Group::ConcatTensors(const platform::DeviceContext &context) { PADDLE_THROW(platform::errors::PermissionDenied( "Paddle can't concat npu grads since it's not compiled with HCCL," "Please recompile or reinstall Paddle with HCCL support.")); +#endif + } else if (platform::is_mlu_place(place)) { +#ifdef PADDLE_WITH_CNCL + ConcatTensorsWithType( + static_cast(context), + dense_tensors_, &dense_contents_, dtype_); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Paddle can't concat mlu grads since it's not compiled with CNCL," + "Please recompile or reinstall Paddle with CNCL support.")); #endif } else if (platform::is_cpu_place(place)) { ConcatTensorsWithType( @@ -295,6 +358,16 @@ void Group::SplitTensors(const platform::DeviceContext &context) { 
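(Editor's note, not part of the patch.) The new MLU specializations above follow the same shape as the existing CPU/GPU ones: a switch over the runtime dtype forwards to a worker templated on the element type (FP16 or FP32). A compact standalone sketch of that dispatch idiom, with a toy element count instead of real tensors (ConcatForAllReduce, Float16, and VarDType are illustrative stand-ins):

#include <cstddef>
#include <cstdio>
#include <stdexcept>

enum class VarDType { kFP16, kFP32 };

struct Float16 { unsigned short bits; };  // stand-in for platform::float16

// Templated worker; in the real pass this role is played by the
// device/type-templated concat-for-allreduce helper.
template <typename T>
void ConcatForAllReduce(std::size_t element_count) {
  std::printf("concat %zu elements of %zu-byte type\n", element_count,
              sizeof(T));
}

// Runtime switch over dtype, forwarding to the right instantiation.
void ConcatTensorsWithType(VarDType dtype, std::size_t element_count) {
  switch (dtype) {
    case VarDType::kFP16:
      ConcatForAllReduce<Float16>(element_count);
      break;
    case VarDType::kFP32:
      ConcatForAllReduce<float>(element_count);
      break;
    default:
      throw std::invalid_argument("dtype not supported for allreduce concat");
  }
}

int main() {
  ConcatTensorsWithType(VarDType::kFP32, 1024);
  ConcatTensorsWithType(VarDType::kFP16, 1024);
  return 0;
}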
PADDLE_THROW(platform::errors::PermissionDenied( "Paddle can't split npu grad since it's not compiled with HCCL," "Please recompile or reinstall Paddle with HCCL support.")); +#endif + } else if (platform::is_mlu_place(place)) { +#ifdef PADDLE_WITH_CNCL + SplitTensorsWithType( + static_cast(context), + &dense_contents_, &dense_tensors_, dtype_); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Paddle can't split mlu grad since it's not compiled with CNCL," + "Please recompile or reinstall Paddle with CNCL support.")); #endif } else if (platform::is_cpu_place(place)) { SplitTensorsWithType( @@ -746,6 +819,11 @@ void Reducer::MarkVarReady(const size_t var_index, const bool is_used_var) { // TODO(liuyuhui) support XPU set constant VLOG(3) << "XPU doesn't support set_constant"; } +#elif defined(PADDLE_WITH_CNCL) + if (platform::is_mlu_place(group_tensor.place())) { + // TODO(liuyuhui) support MLU set constant + VLOG(3) << "MLU doesn't support set_constant"; + } #else auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place_); if (HasGrad(var_index)) { @@ -846,12 +924,13 @@ void Reducer::MarkGroupReady(size_t group_index) { cv_.notify_all(); } }); -#elif defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL) || \ - defined(PADDLE_WITH_GLOO) || defined(PADDLE_WITH_ASCEND_CL) +#elif defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL) || \ + defined(PADDLE_WITH_GLOO) || defined(PADDLE_WITH_ASCEND_CL) || \ + defined(PADDLE_WITH_CNCL) FusedAllReduceSchedule(run_order, group, next_group_); #else PADDLE_THROW(platform::errors::PreconditionNotMet( - "Not compiled with BKCL or NCCL or GLOO.")); + "Not compiled with BKCL or NCCL or CNCL or GLOO.")); #endif } } diff --git a/paddle/fluid/imperative/reducer.h b/paddle/fluid/imperative/reducer.h index cca773b840c279f05cd6bcd0ed82fda7fdd55a25..9fac4b41cbde01f365dcc603844b06c473a58843 100644 --- a/paddle/fluid/imperative/reducer.h +++ b/paddle/fluid/imperative/reducer.h @@ -45,7 +45,7 @@ namespace imperative { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_GLOO) || \ - defined(PADDLE_WITH_ASCEND_CL) + defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_CNCL) template struct DivNRanksFunctor { diff --git a/paddle/fluid/imperative/tests/CMakeLists.txt b/paddle/fluid/imperative/tests/CMakeLists.txt index e4f1cfdb3baeed9b5945b7843b6593528df48c29..09de0106ed6190c5f627ba9fb7cc038593b5088a 100644 --- a/paddle/fluid/imperative/tests/CMakeLists.txt +++ b/paddle/fluid/imperative/tests/CMakeLists.txt @@ -21,6 +21,6 @@ cc_test(test_prepare_op SRCS test_prepare_op.cc DEPS prepared_operator op_info s cc_test(test_tracer SRCS test_tracer.cc DEPS tracer layer proto_desc operator op_registry variable_helper mul_op reduce_sum_op elementwise_add_op memcpy) cc_test(test_hooks SRCS test_hooks.cc DEPS tracer basic_engine layer proto_desc operator op_registry variable_helper mul_op elementwise_add_op memcpy) cc_test(test_eager SRCS test_eager.cc DEPS tracer layer prepared_operator mul_op) -if (WITH_NCCL OR WITH_RCCL OR WITH_XPU_BKCL) +if (WITH_NCCL OR WITH_RCCL OR WITH_XPU_BKCL OR WITH_CNCL) cc_test(test_group SRCS test_group.cc DEPS reducer concat_and_split memcpy) endif() diff --git a/paddle/fluid/imperative/tests/test_group.cc b/paddle/fluid/imperative/tests/test_group.cc index 6c304278d21fde7af093b25cdd8f62a1d4528d31..5e674af1a08a87c11bfab1080be42e623661b38e 100644 --- a/paddle/fluid/imperative/tests/test_group.cc +++ b/paddle/fluid/imperative/tests/test_group.cc @@ -72,8 +72,10 @@ 
void GroupConcatSplit(Place place, size_t size) { value.push_back(static_cast(1.0 * j)); } - if (std::is_same::value) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + if (std::is_same::value || + std::is_same::value) { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(PADDLE_WITH_CNCL) paddle::memory::Copy(place, data, cpu_place, value.data(), sizeof(T) * value.size(), 0); #endif @@ -180,5 +182,19 @@ TEST(TestGroup, TestXPUConcatSplit) { } #endif +#if defined(PADDLE_WITH_CNCL) +TEST(TestGroup, TestMLUConcatSplit) { + platform::MLUPlace mlu_place(0); + platform::CPUPlace cpu_place; + + int size = 3; + GroupConcatSplit(cpu_place, size); + GroupConcatSplit(mlu_place, size); + + size = 15; + GroupConcatSplit(cpu_place, size); + GroupConcatSplit(mlu_place, size); +} +#endif } // namespace imperative } // namespace paddle diff --git a/paddle/fluid/imperative/tests/test_hooks.cc b/paddle/fluid/imperative/tests/test_hooks.cc index 3ac2028790608529e0745dde2ce41ed57748f46d..02a1689c23a3fe5e1543a2e52d7661d5997bc062 100644 --- a/paddle/fluid/imperative/tests/test_hooks.cc +++ b/paddle/fluid/imperative/tests/test_hooks.cc @@ -24,6 +24,10 @@ #include "paddle/fluid/imperative/hooks.h" #include "paddle/fluid/imperative/tracer.h" #include "paddle/fluid/memory/memcpy.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT); namespace platform = paddle::platform; namespace framework = paddle::framework; diff --git a/paddle/fluid/imperative/tests/test_prepare_op.cc b/paddle/fluid/imperative/tests/test_prepare_op.cc index f5ca13cb99ad3df6b9283565b5681c36f7197ae8..4cda3f32fdf3fdd2d14b201fa902c1f50f3ff98d 100644 --- a/paddle/fluid/imperative/tests/test_prepare_op.cc +++ b/paddle/fluid/imperative/tests/test_prepare_op.cc @@ -24,6 +24,13 @@ #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/imperative/prepared_operator.h" #include "paddle/fluid/imperative/type_defs.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_DECLARE_KERNEL(split, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(relu, CPU, ALL_LAYOUT); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_DECLARE_KERNEL(relu, GPU, ALL_LAYOUT); +#endif namespace imperative = paddle::imperative; namespace platform = paddle::platform; @@ -226,7 +233,7 @@ TEST(test_prepare_op, test_prepare_data_cpu_mkldnn) { } // namespace paddle USE_OP_ITSELF(split); -USE_OP(relu); +USE_OP_ITSELF(relu); #ifdef PADDLE_WITH_MKLDNN USE_OP_DEVICE_KERNEL(relu, MKLDNN); #endif diff --git a/paddle/fluid/imperative/tests/test_tracer.cc b/paddle/fluid/imperative/tests/test_tracer.cc index d05036f7a12ebdc3db5fbfda5eb50c295c0478e4..2e38bd77cf63cc85b75a50e62250a6e746f525bc 100644 --- a/paddle/fluid/imperative/tests/test_tracer.cc +++ b/paddle/fluid/imperative/tests/test_tracer.cc @@ -28,6 +28,13 @@ #include "paddle/fluid/imperative/tracer.h" #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_DECLARE_KERNEL(add_grad, GPU, ALL_LAYOUT); +#endif namespace imperative = paddle::imperative; namespace platform = paddle::platform; @@ -591,5 +598,5 @@ TEST(test_tracer, eager_tracer) { USE_OP(mul); USE_OP(mul_grad); USE_OP_ITSELF(reduce_sum); -USE_OP(reduce_sum_grad); +USE_OP_ITSELF(reduce_sum_grad); 
USE_OP_ITSELF(elementwise_add); diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 85bcbd1458f24a592b646dfcda750f37f113f73f..01c9d2847e0c850fd4159613a47d647bdbf46c31 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -18,12 +18,14 @@ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/imperative/amp_auto_cast.h" +#include "paddle/fluid/imperative/execution_context.h" #include "paddle/fluid/imperative/op_base.h" #include "paddle/fluid/platform/denormal.h" #include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/fluid/string/string_helper.h" +#include "paddle/phi/common/place.h" DECLARE_bool(use_mkldnn); DECLARE_string(tracer_mkldnn_ops_on); @@ -253,7 +255,7 @@ void Tracer::TraceOp(const std::string& type, const NameVarMap& ins, #endif } else if (platform::is_custom_place(place)) { #ifdef PADDLE_WITH_CUSTOM_DEVICE - platform::DeviceManager::SetDevice(place); + phi::DeviceManager::SetDevice(place); #else PADDLE_THROW(platform::errors::PreconditionNotMet( "PaddlePaddle should compile with CustomDevice if use " @@ -382,5 +384,36 @@ bool Tracer::ComputeRequiredGrad(const NameTensorMap& ins, return false; } +phi::KernelSignature Tracer::GetExpectedKernelSignature( + const std::string& type, const NameVarBaseMap& ins, + const NameVarBaseMap& outs, framework::AttributeMap attrs) const { + auto op = framework::OpRegistry::CreateOp(type, {}, {}, {}, false); + framework::RuntimeContext ctx({}, {}); + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto* dev_ctx = pool.Get(phi::CPUPlace()); + const auto& op_info = op->Info(); + auto* attr_checker = op_info.Checker(); + if (attr_checker) { + attr_checker->Check(&attrs, true, /*only_check_exist_value=*/true); + } + static paddle::framework::AttributeMap empty_attrs_map = {}; + const paddle::framework::AttributeMap& default_attrs = + attr_checker == nullptr ? 
empty_attrs_map + : attr_checker->GetDefaultAttrMap(); + auto dygraph_exe_ctx = + imperative::DygraphExecutionContext<imperative::VarBase>( + *op, framework::Scope(), *dev_ctx, ctx, ins, outs, attrs, + default_attrs); + auto* opbase_with_kernel = + dynamic_cast<framework::OperatorWithKernel*>(op.get()); + PADDLE_ENFORCE_NE(opbase_with_kernel, nullptr, + platform::errors::InvalidArgument( + "This op type:`%s` is not a OperatorWithKernel, only " + "OperatorWithKernel can get KernelSignature", + type)); + return phi::KernelSignature( + std::move(opbase_with_kernel->GetExpectedPhiKernelArgs(dygraph_exe_ctx))); + } + } // namespace imperative } // namespace paddle diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index 73ecbbe6143ca8e68049c2d2886e9eee93b741f1..fd13fce6a6e17a47a7a91dfa78598a99ec22f0b7 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -28,6 +28,7 @@ #include "paddle/fluid/imperative/jit/program_desc_tracer.h" #include "paddle/fluid/imperative/layer.h" #include "paddle/fluid/platform/macros.h" +#include "paddle/phi/core/compat/arg_map_context.h" namespace paddle { namespace imperative { @@ -154,6 +155,10 @@ class Tracer { } } + phi::KernelSignature GetExpectedKernelSignature( + const std::string& type, const NameVarBaseMap& ins, + const NameVarBaseMap& outs, framework::AttributeMap attrs) const; + paddle::framework::GarbageCollector* MutableGarbageCollectorIfNotExists( const platform::Place& place); diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 26b8b9e8e17e046964d648f564c26293036e4033..5d0c3c98d2f618eb1f3d41e6a4e2434e5cd80401 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -45,6 +45,11 @@ add_subdirectory(api) set(STATIC_INFERENCE_API paddle_inference_api analysis_predictor zero_copy_tensor reset_tensor_array analysis_config paddle_pass_builder activation_functions ${mkldnn_quantizer_cfg}) + +if(WITH_ONNXRUNTIME) + set(STATIC_INFERENCE_API ${STATIC_INFERENCE_API} onnxruntime_predictor) +endif() + #TODO(wilber, T8T9): Do we still need to support windows gpu static library?
if(WIN32 AND WITH_GPU) cc_library(paddle_inference DEPS ${fluid_modules} ${phi_modules} ${STATIC_INFERENCE_API} ${utils_modules}) @@ -91,6 +96,13 @@ if (WITH_PSCORE) set(SHARED_INFERENCE_DEPS ${SHARED_INFERENCE_DEPS} fleet ps_service) endif () +if (WITH_ONNXRUNTIME) + set(SHARED_INFERENCE_SRCS ${SHARED_INFERENCE_SRCS} + ${CMAKE_CURRENT_SOURCE_DIR}/api/onnxruntime_predictor.cc + ) + set(SHARED_INFERENCE_DEPS ${SHARED_INFERENCE_DEPS} onnxruntime_predictor) +endif (WITH_ONNXRUNTIME) + # Create shared inference library cc_library(paddle_inference_shared SHARED SRCS ${SHARED_INFERENCE_SRCS} DEPS ${SHARED_INFERENCE_DEPS}) diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index 87efe5ec5190372b48f1bd6387e1c92f456865a1..bdc16ef4c7907764473c552461cde35f011ad489 100755 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -31,7 +31,7 @@ cc_library(paddle_infer_contrib SRCS paddle_infer_contrib.cc DEPS zero_copy_tens cc_library(paddle_pass_builder SRCS paddle_pass_builder.cc) set(paddle_inference_api_deps lod_tensor scope reset_tensor_array - analysis_config paddle_infer_contrib zero_copy_tensor trainer_desc_proto custom_operator custom_kernel) + analysis_config paddle_infer_contrib zero_copy_tensor trainer_desc_proto custom_operator) if(WITH_CRYPTO) list(APPEND paddle_inference_api_deps paddle_crypto) @@ -49,8 +49,15 @@ if(WITH_GPU AND TENSORRT_FOUND) set(inference_deps ${inference_deps} tensorrt_engine tensorrt_converter) endif() -cc_library(analysis_predictor SRCS analysis_predictor.cc ${mkldnn_quantizer_src} DEPS ${inference_deps} - zero_copy_tensor ir_pass_manager op_compatible_info infer_io_utils) +if (WITH_ONNXRUNTIME) + cc_library(analysis_predictor SRCS analysis_predictor.cc ${mkldnn_quantizer_src} DEPS ${inference_deps} + zero_copy_tensor ir_pass_manager op_compatible_info infer_io_utils onnxruntime paddle2onnx) + cc_library(onnxruntime_predictor SRCS onnxruntime_predictor.cc DEPS analysis_predictor) +else (WITH_ONNXRUNTIME) + cc_library(analysis_predictor SRCS analysis_predictor.cc ${mkldnn_quantizer_src} DEPS ${inference_deps} + zero_copy_tensor ir_pass_manager op_compatible_info infer_io_utils) +endif (WITH_ONNXRUNTIME) + cc_test(test_paddle_inference_api SRCS api_tester.cc DEPS paddle_inference_api) @@ -75,6 +82,16 @@ elseif (WIN32) ARGS --dirname=${WORD2VEC_MODEL_DIR}) endif() +if (WITH_ONNXRUNTIME) + if (NOT APPLE AND NOT WIN32) + cc_test(test_onnxruntime_predictor SRCS onnxruntime_predictor_tester.cc DEPS paddle_inference_shared + ARGS --dirname=${MOBILENETV2_MODEL_DIR}) + elseif (WIN32) + cc_test(test_onnxruntime_predictor SRCS onnxruntime_predictor_tester.cc DEPS onnxruntime_predictor benchmark ${inference_deps} + ARGS --dirname=${MOBILENETV2_MODEL_DIR}) + endif() +endif() + if(WITH_TESTING AND WITH_MKLDNN) if (NOT APPLE AND NOT WIN32) cc_test(test_mkldnn_quantizer SRCS mkldnn_quantizer_tester.cc DEPS paddle_inference_shared ARGS --dirname=${WORD2VEC_MODEL_DIR}) diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index fd2ccffae3b4af3280f622722d6080d7c68bfbad..41c01d3b7e261314d8dc6b852f5b2a597421fe48 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -168,6 +168,33 @@ void AnalysisConfig::SetIpuConfig(bool ipu_enable_fp16, int ipu_replica_num, Update(); } +void AnalysisConfig::EnableONNXRuntime() { +#ifdef PADDLE_WITH_ONNXRUNTIME + use_onnxruntime_ = true; +#else + 
LOG(ERROR) << "Please compile with onnxruntime to EnableONNXRuntime()"; + use_onnxruntime_ = false; +#endif + + Update(); +} + +void AnalysisConfig::DisableONNXRuntime() { + use_onnxruntime_ = false; + Update(); +} + +void AnalysisConfig::EnableORTOptimization() { +#ifdef PADDLE_WITH_ONNXRUNTIME + enable_ort_optimization_ = true; +#else + LOG(ERROR) << "Please compile with onnxruntime to EnableORTOptimization()"; + enable_ort_optimization_ = false; +#endif + + Update(); +} + AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { #define CP_MEMBER(member__) member__ = other.member__; @@ -274,6 +301,9 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { CP_MEMBER(ipu_available_memory_proportion_); CP_MEMBER(ipu_enable_half_partial_); + // fleet exe related + CP_MEMBER(dist_config_); + if (use_gpu_) { PADDLE_ENFORCE_EQ(use_xpu_, false, platform::errors::InvalidArgument( diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index cd6e3a3c759c05bda34978dd78d07358aacd53fe..871ed596a3ee9d6362b03e99ca10313765826a51 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -30,6 +30,7 @@ #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/naive_executor.h" +#include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/var_type_traits.h" #include "paddle/fluid/framework/version.h" @@ -47,6 +48,14 @@ #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/phi/api/ext/op_meta_info.h" +#include "paddle/utils/string/split.h" + +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ + !defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/distributed/fleet_executor/fleet_executor.h" +#include "paddle/fluid/distributed/fleet_executor/fleet_executor_desc.pb.h" +#include "paddle/fluid/distributed/fleet_executor/task_node.h" +#endif #ifdef PADDLE_WITH_MKLML #include "paddle/fluid/platform/dynload/mklml.h" @@ -56,6 +65,10 @@ #include "paddle/fluid/inference/api/mkldnn_quantizer.h" #endif +#ifdef PADDLE_WITH_ONNXRUNTIME +#include "paddle/fluid/inference/api/onnxruntime_predictor.h" +#endif + #if PADDLE_WITH_TENSORRT #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/helper.h" @@ -71,6 +84,8 @@ using inference::tensorrt::TRTCalibratorEngine; using inference::tensorrt::TRTCalibratorEngineManager; #endif +int AnalysisPredictor::clone_num_ = 1; + namespace { bool IsPersistable(const framework::VarDesc *var) { if (var->Persistable() && @@ -186,14 +201,14 @@ bool AnalysisPredictor::Init( return false; } + // Get the feed_target_names and fetch_target_names + PrepareFeedFetch(); + // Prepare executor, create local variables. 
if (!PrepareExecutor()) { return true; } - // Get the feed_target_names and fetch_target_names - PrepareFeedFetch(); - return true; } @@ -359,6 +374,13 @@ static void DisablePrepareDataOpt( } bool AnalysisPredictor::PrepareExecutor() { +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ + !defined(PADDLE_WITH_ASCEND_CL) + if (config_.dist_config().use_dist_model()) { + VLOG(3) << "use_dist_model is enabled, will init FleetExecutor."; + return PrepareFleetExecutor(); + } +#endif DisablePrepareDataOpt(inference_program_, 0, false); executor_->Prepare(sub_scope_, *inference_program_, 0, @@ -371,6 +393,226 @@ bool AnalysisPredictor::PrepareExecutor() { return true; } +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ + !defined(PADDLE_WITH_ASCEND_CL) +bool AnalysisPredictor::PrepareFleetExecutor() { + VLOG(3) << "AnalysisPredictor::PrepareFleetExecutor()"; + if (config_.dist_config().nranks() > 1 && !CommInit()) { + return false; + } + task_node_.reset(new distributed::TaskNode(inference_program_.get(), + config_.dist_config().rank())); + // With auto cut, there is no concept of pp, no need to add dependency. + task_node_->SetType("Compute"); + task_node_->Init(config_.use_feed_fetch_ops_enabled()); + executor_desc_ = distributed::FleetExecutorDesc(); + executor_desc_.set_cur_rank(config_.dist_config().rank()); + std::unordered_map id_to_rank; + for (int i = 0; i < config_.dist_config().nranks(); ++i) { + distributed::RankInfo *rank_info = executor_desc_.add_cluster_info(); + rank_info->set_rank(i); + rank_info->set_ip_port(config_.dist_config().trainer_endpoints()[i]); + id_to_rank.insert({i, i}); + } + fleet_exe_.reset(new distributed::FleetExecutor(executor_desc_)); + // NOTE: Vars of feed fetch ops are not persistable, + // which will result in that those vars will be created in + // the subscope (microscope) in fleet executor. This will + // cause that the GetInputTensor/GetOutputTensor funct + // in analysis predictor cannot find those vars in the scope + // returned by the DistModel, since DistModel only return the + // root scope. 
So, those vars must to be created in the root + // scope instead of in the microscope + std::vector feed_fetch_vars; + for (auto pair : idx2feeds_) { + feed_fetch_vars.emplace_back(pair.second); + } + for (auto pair : idx2fetches_) { + feed_fetch_vars.emplace_back(pair.second); + } + fleet_exe_->Init(config_.dist_config().carrier_id(), + *(inference_program_.get()), scope_.get(), place_, 1, + {task_node_.get()}, id_to_rank, feed_fetch_vars); + return true; +} + +bool AnalysisPredictor::CommInit() { + std::map> ring_id_to_ranks{}; + std::map> rank_to_ring_ids{}; + if (!LoadConverterConfig(&ring_id_to_ranks, &rank_to_ring_ids)) { + VLOG(3) << "Load converter config failed, DistModel init failed."; + return false; + } + std::unique_ptr comm_init_program( + new framework::ProgramDesc()); + framework::BlockDesc *comm_init_block = comm_init_program->MutableBlock(0); + std::vector &ring_ids = + rank_to_ring_ids[config_.dist_config().rank()]; + int64_t order = 0; + std::string var_name_base = "comm_init_"; + for (int64_t ring_id : ring_ids) { + VLOG(3) << "Init comm for ring id: " << ring_id; + int64_t ranks_in_group = ring_id_to_ranks[ring_id].size(); + int64_t rank_in_group = 0; + std::vector &ranks = ring_id_to_ranks[ring_id]; + for (int64_t rank : ranks) { + if (config_.dist_config().rank() == rank) { + break; + } + rank_in_group += 1; + } + std::vector peer_endpoints; + for (int64_t rank : ranks) { + if (config_.dist_config().rank() == rank) { + continue; + } + peer_endpoints.emplace_back( + config_.dist_config().trainer_endpoints()[rank]); + } + InsertCommOp(var_name_base + std::to_string(order), ranks_in_group, + rank_in_group, peer_endpoints, comm_init_block, ring_id); + order += 1; + } + framework::NaiveExecutor e(place_); + e.CreateVariables(*comm_init_program, 0, true, scope_.get()); + e.Prepare(scope_.get(), *comm_init_program, 0, false); + e.Run(); + VLOG(3) << "Comm init successful."; + return true; +} + +void AnalysisPredictor::InsertCommOp( + std::string tmp_var_name, int nranks, int rank, + const std::vector &peer_endpoints, framework::BlockDesc *block, + int ring_id) { + /* + * tmp_var_name: the var name for var comm_id + * nranks: number of total ranks + * rank: the rank of local rank in the comm group + * peer_endpoints: peer's endpoints + * block: the block where to insert the comm ops + * ring_id: the ring_id to be inited + */ + const std::string &endpoint = config_.dist_config().current_endpoint(); + std::stringstream ss; + ss << "Init comm with tmp var: " << tmp_var_name + << ". The ring id is: " << ring_id << ". The group has: " << nranks + << " ranks. Current rank in the group is: " << rank + << ". The endpoint is: " << endpoint << ". 
Peer endpoints are: "; + for (auto ep : peer_endpoints) { + ss << ep << ", "; + } + VLOG(3) << ss.str(); + if (config_.use_gpu()) { + framework::VarDesc *new_var = block->Var(tmp_var_name); + new_var->SetType(framework::proto::VarType::RAW); + new_var->SetPersistable(true); + framework::OpDesc *gen_nccl_id_op = block->AppendOp(); + gen_nccl_id_op->SetType("c_gen_nccl_id"); + gen_nccl_id_op->SetOutput("Out", {tmp_var_name}); + gen_nccl_id_op->SetAttr("rank", rank); + gen_nccl_id_op->SetAttr("endpoint", + config_.dist_config().current_endpoint()); + gen_nccl_id_op->SetAttr("other_endpoints", peer_endpoints); + gen_nccl_id_op->SetAttr("ring_id", ring_id); + gen_nccl_id_op->SetAttr("op_role", + static_cast(framework::OpRole::kForward)); + gen_nccl_id_op->CheckAttrs(); + framework::OpDesc *comm_init_op = block->AppendOp(); + comm_init_op->SetType("c_comm_init"); + comm_init_op->SetInput("X", {tmp_var_name}); + comm_init_op->SetAttr("rank", rank); + comm_init_op->SetAttr("nranks", nranks); + comm_init_op->SetAttr("ring_id", ring_id); + comm_init_op->SetAttr("op_role", + static_cast(framework::OpRole::kForward)); + comm_init_op->CheckAttrs(); + } else { + LOG(WARNING) << "DistModelInf doesn't init comm."; + // TODO(fleet exe dev): comm init for more devices + } +} + +bool AnalysisPredictor::LoadConverterConfig( + std::map> *ring_id_to_ranks, + std::map> *rank_to_ring_ids) { + VLOG(3) << "Going to load converter config from: " + << config_.dist_config().comm_init_config() << "\n"; + std::ifstream fin(config_.dist_config().comm_init_config(), std::ios::in); + PADDLE_ENFORCE_EQ( + static_cast(fin.is_open()), true, + platform::errors::NotFound( + "Cannot open file %s, please confirm whether the file is normal.", + config_.dist_config().comm_init_config())); + std::string line; + bool ring_to_rank{true}; + // Reading config from file, the config file should like these format + // [ring_id -> ranks] + // 0,0,1,2,3 + // 1,0,1 + // 2,2,3 + // 21,0,1 + // 22,1,2 + // 23,2,3 + // [rank -> ring_ids] + // 0,0,1,21 + // 1,0,1,21,22 + // 2,0,2,22,23 + // 3,0,2,23 + while (std::getline(fin, line)) { + std::vector one_line = paddle::string::Split(line, ','); + if (one_line.size() == 1) { + // start a new section of the config + if (line == "[ring_id -> ranks]") { + ring_to_rank = true; + } else if (line == "[rank -> ring_ids]") { + ring_to_rank = false; + } + } else { + // parse key - values pairs in one section + int64_t key = std::stoll(one_line[0]); + for (size_t i = 1; i < one_line.size(); ++i) { + int64_t val = std::stoll(one_line[i]); + if (ring_to_rank) { + if (ring_id_to_ranks->find(key) == ring_id_to_ranks->end()) { + ring_id_to_ranks->insert({key, std::vector()}); + } + ring_id_to_ranks->at(key).emplace_back(val); + } else { + if (rank_to_ring_ids->find(key) == rank_to_ring_ids->end()) { + rank_to_ring_ids->insert({key, std::vector()}); + } + rank_to_ring_ids->at(key).emplace_back(val); + } + // NOTE: add more configuration sections here + } + } + } + std::stringstream ss; + ss << "Loaded the following converter config:\n"; + ss << "ring_id_to_ranks:\n"; + for (auto pair : *ring_id_to_ranks) { + int64_t key = pair.first; + ss << "\t" << key << "\t->\t"; + for (auto value : pair.second) { + ss << value << "\t"; + } + ss << "\n"; + } + ss << "rank_to_ring_ids:\n"; + for (auto pair : *rank_to_ring_ids) { + int64_t key = pair.first; + ss << "\t" << key << "\t->\t"; + for (auto value : pair.second) { + ss << value << "\t"; + } + ss << "\n"; + } + VLOG(3) << ss.str(); + return true; +} +#endif + void 
AnalysisPredictor::MkldnnPreSet(const std::vector &inputs) { #ifdef PADDLE_WITH_MKLDNN std::vector> inputs_shape; @@ -946,13 +1188,24 @@ std::vector AnalysisPredictor::GetOutputNames() { std::unique_ptr AnalysisPredictor::GetInputTensor( const std::string &name) { + framework::Scope *scope; +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ + !defined(PADDLE_WITH_ASCEND_CL) + if (config_.dist_config().use_dist_model()) { + scope = scope_.get(); + } else { + scope = executor_->scope(); + } +#else + scope = executor_->scope(); +#endif PADDLE_ENFORCE_NOT_NULL( - executor_->scope()->FindVar(name), + scope->FindVar(name), platform::errors::PreconditionNotMet( - "The variable named %s is not found in the scope of the exector.", + "The variable named %s is not found in the scope of the executor.", name)); std::unique_ptr res( - new ZeroCopyTensor(static_cast(executor_->scope()))); + new ZeroCopyTensor(static_cast(scope))); res->input_or_output_ = true; res->SetName(name); if (platform::is_cpu_place(place_)) { @@ -985,13 +1238,24 @@ std::unique_ptr AnalysisPredictor::GetInputTensor( std::unique_ptr AnalysisPredictor::GetOutputTensor( const std::string &name) { + framework::Scope *scope; +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ + !defined(PADDLE_WITH_ASCEND_CL) + if (config_.dist_config().use_dist_model()) { + scope = scope_.get(); + } else { + scope = executor_->scope(); + } +#else + scope = executor_->scope(); +#endif PADDLE_ENFORCE_NOT_NULL( - executor_->scope()->FindVar(name), + scope->FindVar(name), platform::errors::PreconditionNotMet( - "he variable named %s is not found in the scope of the exector.", + "The variable named %s is not found in the scope of the executor.", name)); std::unique_ptr res( - new ZeroCopyTensor(static_cast(executor_->scope()))); + new ZeroCopyTensor(static_cast(scope))); res->input_or_output_ = false; res->SetName(name); if (platform::is_cpu_place(place_)) { @@ -1023,6 +1287,18 @@ std::unique_ptr AnalysisPredictor::GetOutputTensor( } bool AnalysisPredictor::ZeroCopyRun() { +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ + !defined(PADDLE_WITH_ASCEND_CL) + if (config_.dist_config().use_dist_model()) { + VLOG(3) << "ZeroCopyRun will use the fleet executor."; + inference::Timer timer; + timer.tic(); + fleet_exe_->Run(config_.dist_config().carrier_id()); + VLOG(3) << "Fleet executor inf runs once use: " + << std::to_string(timer.toc()) << "ms"; + return true; + } +#endif paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads()); #ifdef PADDLE_WITH_MKLDNN if (config_.use_mkldnn_) { @@ -1035,7 +1311,6 @@ bool AnalysisPredictor::ZeroCopyRun() { MkldnnPreSet(shape_vector); } #endif - executor_->Run(); if (config_.shape_range_info_collected()) { @@ -1364,7 +1639,7 @@ std::unique_ptr AnalysisPredictor::Clone() { std::lock_guard lk(clone_mutex_); auto *x = new AnalysisPredictor(config_); x->Init(scope_, inference_program_); - x->executor_->ResetTrtOps(++x->clone_num_); + x->executor_->ResetTrtOps(++AnalysisPredictor::clone_num_); return std::unique_ptr(x); } @@ -1491,6 +1766,27 @@ namespace paddle_infer { Predictor::Predictor(const Config &config) { const_cast(&config)->SwitchUseFeedFetchOps(false); // The second parameter indicates that the discard log is not printed + if (config.use_onnxruntime()) { +#ifdef PADDLE_WITH_ONNXRUNTIME + if (config.use_gpu()) { + LOG(WARNING) << "The current ONNXRuntime backend doesn't support GPU," + "and it falls back to use Paddle Inference."; + } else 
if (!paddle::CheckConvertToONNX(config)) { + LOG(WARNING) + << "Paddle2ONNX does not support converting this model; falling back to " + "Paddle Inference."; + } else { + predictor_ = paddle::CreatePaddlePredictor< + Config, paddle::PaddleEngineKind::kONNXRuntime>(config); + return; + } +#else + LOG(WARNING) + << "The ONNXRuntime backend is not enabled; please re-compile Paddle with " + "the WITH_ONNXRUNTIME option. Falling back to Paddle Inference."; +#endif + } predictor_ = paddle::CreatePaddlePredictor< Config, paddle::PaddleEngineKind::kAnalysis>(config); } diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index a8e56101d37dabe8837b8adde9672ce45ffd62a0..21a7e9658bbeeb16d4cbff6364aaef68edcae16d 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -18,6 +18,10 @@ #include #include #include +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ + !defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/distributed/fleet_executor/fleet_executor.h" +#endif #include "paddle/fluid/framework/naive_executor.h" #include "paddle/fluid/framework/op_compatible_info.h" #include "paddle/fluid/inference/analysis/analyzer.h" @@ -391,6 +395,53 @@ class AnalysisPredictor : public PaddlePredictor { void StatisticShapeRangeInfo(); void CollectShapeRangeInfo(); +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ + !defined(PADDLE_WITH_ASCEND_CL) + // fleet exe related + + /// + /// \brief prepare for fleet executor to run + /// + /// Used in AnalysisPredictor::Init(), + /// + bool PrepareFleetExecutor(); + + /// + /// \brief init NCCL env for multi-GPU inference + /// + /// Used in AnalysisPredictor::PrepareFleetExecutor() + /// + bool CommInit(); + + /// + /// \brief read the config to init NCCL env + /// + /// Used in AnalysisPredictor::CommInit() + /// + /// \param[in] ring_id_to_ranks: a ptr to ring_id_to_ranks + /// \param[in] rank_to_ring_ids: a ptr to rank_to_ring_ids + /// + bool LoadConverterConfig( + std::map> *ring_id_to_ranks, + std::map> *rank_to_ring_ids); + + /// + /// \brief add ops and run them with NaiveExecutor to init NCCL env + /// + /// Used in AnalysisPredictor::CommInit() + /// + /// \param[in] tmp_var_name: var name to hold NCCL unique id + /// \param[in] nranks: number of ranks in one comm group + /// \param[in] rank: relative rank of current rank in the comm group + /// \param[in] peer_endpoints: group's peers' endpoints + /// \param[in] block: the block to insert comm ops + /// \param[in] ring_id: the ring id to be used to init NCCL env + /// + void InsertCommOp(std::string tmp_var_name, int nranks, int rank, + const std::vector &peer_endpoints, + framework::BlockDesc *block, int ring_id); +#endif + private: AnalysisConfig config_; Argument argument_; @@ -435,7 +486,15 @@ class AnalysisPredictor : public PaddlePredictor { bool status_is_cloned_{false}; std::map>> shape_info_; - int clone_num_{1}; + static int clone_num_; + +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ + !defined(PADDLE_WITH_ASCEND_CL) + // fleet executor related + distributed::FleetExecutorDesc executor_desc_; + std::shared_ptr fleet_exe_; + std::shared_ptr task_node_; +#endif }; } // namespace paddle diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc index a15a1cd84b14094c6ea95f94ffaaf31f4a790376..2c6e8f4f1a4d9ea0dfba8f400c7d3782a5e2c32d 100644 ---
a/paddle/fluid/inference/api/analysis_predictor_tester.cc +++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc @@ -13,6 +13,9 @@ // limitations under the License. #include "paddle/fluid/inference/api/analysis_predictor.h" +#if defined(PADDLE_WITH_CUDA) +#include +#endif #include #include #include // NOLINT @@ -354,6 +357,24 @@ TEST(AnalysisPredictor, set_xpu_device_id) { } #endif +TEST(AnalysisPredictor, enable_onnxruntime) { + AnalysisConfig config; + config.EnableONNXRuntime(); +#ifdef PADDLE_WITH_ONNXRUNTIME + ASSERT_TRUE(config.use_onnxruntime()); +#else + ASSERT_TRUE(!config.use_onnxruntime()); +#endif + config.EnableORTOptimization(); +#ifdef PADDLE_WITH_ONNXRUNTIME + ASSERT_TRUE(config.ort_optimization_enabled()); +#else + ASSERT_TRUE(!config.ort_optimization_enabled()); +#endif + config.DisableONNXRuntime(); + ASSERT_TRUE(!config.use_onnxruntime()); +} + } // namespace paddle namespace paddle_infer { @@ -405,4 +426,91 @@ TEST(Predictor, Run) { predictor->TryShrinkMemory(); } +TEST(Predictor, EnableONNXRuntime) { + Config config; + config.SetModel(FLAGS_dirname); + config.EnableONNXRuntime(); + config.EnableORTOptimization(); + auto predictor = CreatePredictor(config); +} + +TEST(Tensor, CpuShareExternalData) { + Config config; + config.SetModel(FLAGS_dirname); + + auto predictor = CreatePredictor(config); + + auto w0 = predictor->GetInputHandle("firstw"); + auto w1 = predictor->GetInputHandle("secondw"); + auto w2 = predictor->GetInputHandle("thirdw"); + auto w3 = predictor->GetInputHandle("forthw"); + + std::vector> input_data(4, {0, 1, 2, 3}); + w0->ShareExternalData(input_data[0].data(), {4, 1}, PlaceType::kCPU); + w1->ShareExternalData(input_data[1].data(), {4, 1}, PlaceType::kCPU); + w2->ShareExternalData(input_data[2].data(), {4, 1}, PlaceType::kCPU); + w3->ShareExternalData(input_data[3].data(), {4, 1}, PlaceType::kCPU); + + auto out = predictor->GetOutputHandle("fc_1.tmp_2"); + auto out_shape = out->shape(); + std::vector out_data; + out_data.resize(std::accumulate(out_shape.begin(), out_shape.end(), 1, + std::multiplies())); + out->ShareExternalData(out_data.data(), out_shape, PlaceType::kCPU); + + predictor->Run(); + + PlaceType place; + int size = 0; + out->data(&place, &size); + LOG(INFO) << "output size: " << size / sizeof(float); + predictor->TryShrinkMemory(); +} + +#if defined(PADDLE_WITH_CUDA) +TEST(Tensor, GpuShareExternalData) { + Config config; + config.SetModel(FLAGS_dirname); + config.EnableUseGpu(100, 0); + + auto predictor = CreatePredictor(config); + + auto w0 = predictor->GetInputHandle("firstw"); + auto w1 = predictor->GetInputHandle("secondw"); + auto w2 = predictor->GetInputHandle("thirdw"); + auto w3 = predictor->GetInputHandle("forthw"); + + std::vector> input_data(4, {0, 1, 2, 3}); + std::vector input_gpu(4, nullptr); + + for (size_t i = 0; i < 4; ++i) { + cudaMalloc(reinterpret_cast(&input_gpu[i]), 4 * sizeof(int64_t)); + cudaMemcpy(input_gpu[i], input_data[i].data(), 4 * sizeof(int64_t), + cudaMemcpyHostToDevice); + } + + w0->ShareExternalData(input_gpu[0], {4, 1}, PlaceType::kGPU); + w1->ShareExternalData(input_gpu[1], {4, 1}, PlaceType::kGPU); + w2->ShareExternalData(input_gpu[2], {4, 1}, PlaceType::kGPU); + w3->ShareExternalData(input_gpu[3], {4, 1}, PlaceType::kGPU); + + auto out = predictor->GetOutputHandle("fc_1.tmp_2"); + auto out_shape = out->shape(); + float* out_data; + auto out_size = std::accumulate(out_shape.begin(), out_shape.end(), 1, + std::multiplies()) * + sizeof(float); + cudaMalloc(reinterpret_cast(out_data), out_size * 
sizeof(float)); + out->ShareExternalData(out_data, out_shape, PlaceType::kGPU); + + predictor->Run(); + + PlaceType place; + int size = 0; + out->data(&place, &size); + LOG(INFO) << "output size: " << size / sizeof(float); + predictor->TryShrinkMemory(); +} +#endif + } // namespace paddle_infer diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt index d03840ada36bce8cfdc2213284697e6d873cbde0..df98a7b05cf3f2035e9a21ec10e4b44eca843bbd 100644 --- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt +++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt @@ -4,6 +4,7 @@ option(WITH_MKL "Compile demo with MKL/OpenBlas support, default use MKL. option(WITH_GPU "Compile demo with GPU/CPU, default use CPU." OFF) option(WITH_STATIC_LIB "Compile demo with static/shared library, default use static." ON) option(USE_TENSORRT "Compile demo with TensorRT." OFF) +option(WITH_ONNXRUNTIME "Compile demo with ONNXRuntime" OFF) if(NOT WITH_STATIC_LIB) add_definitions("-DPADDLE_WITH_SHARED_LIB") @@ -46,6 +47,13 @@ link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}gflags/lib") link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/lib") link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}cryptopp/lib") link_directories("${PADDLE_LIB}/paddle/lib") +if (WITH_ONNXRUNTIME) + include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/include") + include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}paddle2onnx/include") + + link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib") + link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}paddle2onnx/lib") +endif() if (WIN32) add_definitions("/DGOOGLE_GLOG_DLL_DECL=") @@ -151,6 +159,17 @@ else() endif() endif() +if (WITH_ONNXRUNTIME) + if(WIN32) + set(DEPS ${DEPS} ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/onnxruntime.lib paddle2onnx) + elseif(APPLE) + set(DEPS ${DEPS} ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/libonnxruntime.1.10.0.dylib paddle2onnx) + else() + set(DEPS ${DEPS} ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/libonnxruntime.so.1.10.0 paddle2onnx) + endif() +endif() + + if (NOT WIN32) set(EXTERNAL_LIB "-lrt -ldl -lpthread") set(DEPS ${DEPS} @@ -213,6 +232,14 @@ if(WIN32) COMMAND ${CMAKE_COMMAND} -E copy ${OPENBLAS_LIB_PATH}/lib/openblas.dll ${CMAKE_BINARY_DIR}/Release ) endif() + if(WITH_ONNXRUNTIME) + add_custom_command(TARGET ${DEMO_NAME} POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/onnxruntime.dll + ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE} + COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE_LIB_THIRD_PARTY_PATH}paddle2onnx/lib/paddle2onnx.dll + ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE} + ) + endif() if(NOT WITH_STATIC_LIB) add_custom_command(TARGET ${DEMO_NAME} POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy "${PADDLE_LIB}/paddle/lib/paddle_inference.dll" ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE} diff --git a/paddle/fluid/inference/api/demo_ci/onnxruntime_mobilenet_demo.cc b/paddle/fluid/inference/api/demo_ci/onnxruntime_mobilenet_demo.cc new file mode 100644 index 0000000000000000000000000000000000000000..ef5c08cd041eb7af4c7f17a95c4fd9b8601e4bad --- /dev/null +++ b/paddle/fluid/inference/api/demo_ci/onnxruntime_mobilenet_demo.cc @@ -0,0 +1,64 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +/* + * This file contains a demo of MobileNet inference using the ONNXRuntime backend. + */ + +#include // use glog instead of CHECK to avoid importing other paddle header files. +#include +#include "gflags/gflags.h" +#include "utils.h" // NOLINT + +DEFINE_string(modeldir, "", "Directory of the inference model."); + +namespace paddle { +namespace demo { + +/* + * Use the ONNXRuntime engine to run inference on the demo model. + */ +void Main() { + paddle::AnalysisConfig config; + config.EnableONNXRuntime(); + config.SetModel(FLAGS_modeldir + "/inference.pdmodel", + FLAGS_modeldir + "/inference.pdiparams"); + auto predictor = paddle_infer::CreatePredictor(config); + + // Inference. + std::vector input_shape = {1, 3, 224, 224}; + std::vector input_data(1 * 3 * 224 * 224, 1.0); + std::vector out_data; + out_data.resize(1000); + auto input_names = predictor->GetInputNames(); + auto output_names = predictor->GetOutputNames(); + auto input_tensor = predictor->GetInputHandle(input_names[0]); + input_tensor->Reshape(input_shape); + auto output_tensor = predictor->GetOutputHandle(output_names[0]); + + input_tensor->CopyFromCpu(input_data.data()); + predictor->Run(); + output_tensor->CopyToCpu(out_data.data()); + + VLOG(3) << "output.size " << out_data.size(); +} + +} // namespace demo +} // namespace paddle + +int main(int argc, char** argv) { + ::GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true); + paddle::demo::Main(); + return 0; +} diff --git a/paddle/fluid/inference/api/demo_ci/run.sh b/paddle/fluid/inference/api/demo_ci/run.sh index 5f062e8063253a08466b2491e80417af07047394..79a31555c7f0b1cb4a8d9c48bae16145d605935b 100755 --- a/paddle/fluid/inference/api/demo_ci/run.sh +++ b/paddle/fluid/inference/api/demo_ci/run.sh @@ -21,7 +21,8 @@ TEST_GPU_CPU=$3 # test both GPU/CPU mode or only CPU mode DATA_DIR=$4 # dataset USE_TENSORRT=$5 TENSORRT_ROOT_DIR=$6 # TensorRT root dir, default to /usr -MSVC_STATIC_CRT=$7 +WITH_ONNXRUNTIME=$7 +MSVC_STATIC_CRT=$8 inference_install_dir=${PADDLE_ROOT}/build/paddle_inference_install_dir WIN_DETECT=$(echo `uname` | grep "Win") # detect current platform @@ -38,6 +39,26 @@ else use_gpu_list='false' fi +mkdir -p $DATA_DIR +cd $DATA_DIR + +if [ $7 == ON ]; then + ONNXRUNTIME_LIB=${inference_install_dir}/third_party/install/onnxruntime/lib + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${ONNXRUNTIME_LIB} + PADDLE2ONNX_LIB=${inference_install_dir}/third_party/install/paddle2onnx/lib + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${PADDLE2ONNX_LIB} + #download model + mkdir -p MobileNetV2 + cd MobileNetV2 + if [[ -e "MobileNetV2.inference.model.tar.gz" ]]; then + echo "MobileNetV2.inference.model.tar.gz has been downloaded." + else + wget -q --no-proxy http://paddle-inference-dist.bj.bcebos.com/MobileNetV2.inference.model.tar.gz + tar xzf *.tar.gz + fi + cd .. +fi + PREFIX=inference-vis-demos%2F URL_ROOT=http://paddlemodels.bj.bcebos.com/${PREFIX} @@ -58,8 +79,7 @@ function download() { fi cd ..
} -mkdir -p $DATA_DIR -cd $DATA_DIR + vis_demo_list='se_resnext50 ocr mobilenet' for vis_demo_name in $vis_demo_list; do download $vis_demo_name @@ -93,7 +113,8 @@ for WITH_STATIC_LIB in ON OFF; do -DDEMO_NAME=simple_on_word2vec \ -DWITH_GPU=$TEST_GPU_CPU \ -DWITH_STATIC_LIB=$WITH_STATIC_LIB \ - -DMSVC_STATIC_CRT=$MSVC_STATIC_CRT + -DMSVC_STATIC_CRT=$MSVC_STATIC_CRT \ + -DWITH_ONNXRUNTIME=$WITH_ONNXRUNTIME msbuild /maxcpucount /property:Configuration=Release cpp_inference_demo.sln for use_gpu in $use_gpu_list; do Release/simple_on_word2vec.exe \ @@ -112,7 +133,8 @@ for WITH_STATIC_LIB in ON OFF; do -DDEMO_NAME=vis_demo \ -DWITH_GPU=$TEST_GPU_CPU \ -DWITH_STATIC_LIB=$WITH_STATIC_LIB \ - -DMSVC_STATIC_CRT=$MSVC_STATIC_CRT + -DMSVC_STATIC_CRT=$MSVC_STATIC_CRT \ + -DWITH_ONNXRUNTIME=$WITH_ONNXRUNTIME msbuild /maxcpucount /property:Configuration=Release cpp_inference_demo.sln for use_gpu in $use_gpu_list; do for vis_demo_name in $vis_demo_list; do @@ -138,7 +160,8 @@ for WITH_STATIC_LIB in ON OFF; do -DWITH_STATIC_LIB=$WITH_STATIC_LIB \ -DMSVC_STATIC_CRT=$MSVC_STATIC_CRT \ -DUSE_TENSORRT=$USE_TENSORRT \ - -DTENSORRT_ROOT=$TENSORRT_ROOT_DIR + -DTENSORRT_ROOT=$TENSORRT_ROOT_DIR \ + -DWITH_ONNXRUNTIME=$WITH_ONNXRUNTIME msbuild /maxcpucount /property:Configuration=Release cpp_inference_demo.sln Release/trt_mobilenet_demo.exe \ --modeldir=$DATA_DIR/mobilenet/model \ @@ -156,7 +179,8 @@ for WITH_STATIC_LIB in ON OFF; do -DWITH_MKL=$TURN_ON_MKL \ -DDEMO_NAME=simple_on_word2vec \ -DWITH_GPU=$TEST_GPU_CPU \ - -DWITH_STATIC_LIB=$WITH_STATIC_LIB + -DWITH_STATIC_LIB=$WITH_STATIC_LIB \ + -DWITH_ONNXRUNTIME=$WITH_ONNXRUNTIME make -j$(nproc) word2vec_model=$DATA_DIR'/word2vec/word2vec.inference.model' if [ -d $word2vec_model ]; then @@ -176,7 +200,8 @@ for WITH_STATIC_LIB in ON OFF; do -DWITH_MKL=$TURN_ON_MKL \ -DDEMO_NAME=vis_demo \ -DWITH_GPU=$TEST_GPU_CPU \ - -DWITH_STATIC_LIB=$WITH_STATIC_LIB + -DWITH_STATIC_LIB=$WITH_STATIC_LIB \ + -DWITH_ONNXRUNTIME=$WITH_ONNXRUNTIME make -j$(nproc) for use_gpu in $use_gpu_list; do for vis_demo_name in $vis_demo_list; do @@ -200,7 +225,8 @@ for WITH_STATIC_LIB in ON OFF; do -DWITH_GPU=$TEST_GPU_CPU \ -DWITH_STATIC_LIB=$WITH_STATIC_LIB \ -DUSE_TENSORRT=$USE_TENSORRT \ - -DTENSORRT_ROOT=$TENSORRT_ROOT_DIR + -DTENSORRT_ROOT=$TENSORRT_ROOT_DIR \ + -DWITH_ONNXRUNTIME=$WITH_ONNXRUNTIME make -j$(nproc) ./trt_mobilenet_demo \ --modeldir=$DATA_DIR/mobilenet/model \ @@ -211,6 +237,26 @@ for WITH_STATIC_LIB in ON OFF; do exit 1 fi fi + + # --------onnxruntime mobilenetv2 on linux/mac------ + if [ $WITH_ONNXRUNTIME == ON ]; then + rm -rf * + cmake .. -DPADDLE_LIB=${inference_install_dir} \ + -DWITH_MKL=$TURN_ON_MKL \ + -DDEMO_NAME=onnxruntime_mobilenet_demo \ + -DWITH_GPU=$TEST_GPU_CPU \ + -DWITH_STATIC_LIB=$WITH_STATIC_LIB \ + -DUSE_TENSORRT=$USE_TENSORRT \ + -DTENSORRT_ROOT=$TENSORRT_ROOT_DIR \ + -DWITH_ONNXRUNTIME=$WITH_ONNXRUNTIME + make -j$(nproc) + ./onnxruntime_mobilenet_demo \ + --modeldir=$DATA_DIR/MobileNetV2/MobileNetV2 + if [ $? -ne 0 ]; then + echo "onnxruntime demo onnxruntime_mobilenet_demo runs fail." 
+ exit 1 + fi + fi fi done set +x diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc index 1d09b01f8f852f2bb7f668d0e2b4ee3250c9cc64..18b1d09f0e8a7c4be9862991060a4706ee7cde7e 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc @@ -21,6 +21,7 @@ #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/phi/core/allocator.h" namespace paddle_infer { @@ -205,6 +206,73 @@ void Tensor::CopyFromCpu(const T *data) { } } +template +struct DataTypeInfo; + +template <> +struct DataTypeInfo { + paddle::experimental::DataType TYPE = paddle::experimental::DataType::FLOAT32; +}; + +template <> +struct DataTypeInfo { + paddle::experimental::DataType TYPE = paddle::experimental::DataType::FLOAT16; +}; + +template <> +struct DataTypeInfo { + paddle::experimental::DataType TYPE = paddle::experimental::DataType::INT64; +}; + +template <> +struct DataTypeInfo { + paddle::experimental::DataType TYPE = paddle::experimental::DataType::INT8; +}; + +template <> +struct DataTypeInfo { + paddle::experimental::DataType TYPE = paddle::experimental::DataType::UINT8; +}; + +template <> +struct DataTypeInfo { + paddle::experimental::DataType TYPE = paddle::experimental::DataType::INT32; +}; + +paddle::experimental::DataLayout LayoutConvert(DataLayout layout) { + PADDLE_ENFORCE_EQ( + layout, DataLayout::kNCHW, + paddle::platform::errors::InvalidArgument("Only NCHW is supported now.")); + return paddle::experimental::DataLayout::NCHW; +} + +template +void Tensor::ShareExternalData(const T *data, const std::vector &shape, + PlaceType place, DataLayout layout) { + EAGER_GET_TENSOR(paddle::framework::LoDTensor) + size_t size = + std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()) * + sizeof(T); + phi::DenseTensorMeta meta(DataTypeInfo().TYPE, phi::make_ddim(shape), + LayoutConvert(layout)); + if (place == PlaceType::kCPU) { + phi::DenseTensor dtensor( + std::make_shared(const_cast(data), size, + paddle::platform::CPUPlace()), + meta); + *tensor = std::move(dtensor); + } else if (place == PlaceType::kGPU) { + phi::DenseTensor dtensor( + std::make_shared(const_cast(data), size, + paddle::platform::CUDAPlace(device_)), + meta); + *tensor = std::move(dtensor); + } else { + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "PlaceType must be PlaceType::kCPU or PlaceType::kGPU.")); + } +} + void Tensor::CopyStringsFromCpu(const paddle_infer::Strings *data) { EAGER_GET_TENSOR(paddle_infer::Strings); PADDLE_ENFORCE_GE(tensor->size(), 0, @@ -334,6 +402,25 @@ template PD_INFER_DECL void Tensor::CopyFromCpu(const uint8_t *data); template PD_INFER_DECL void Tensor::CopyFromCpu(const int8_t *data); template PD_INFER_DECL void Tensor::CopyFromCpu(const float16 *data); +template PD_INFER_DECL void Tensor::ShareExternalData( + const float *data, const std::vector &shape, PlaceType place, + DataLayout layout); +template PD_INFER_DECL void Tensor::ShareExternalData( + const int64_t *data, const std::vector &shape, PlaceType place, + DataLayout layout); +template PD_INFER_DECL void Tensor::ShareExternalData( + const int32_t *data, const std::vector &shape, PlaceType place, + DataLayout layout); +template PD_INFER_DECL void Tensor::ShareExternalData( + const uint8_t *data, const std::vector &shape, PlaceType place, + DataLayout layout); +template PD_INFER_DECL void Tensor::ShareExternalData( 
+ const int8_t *data, const std::vector &shape, PlaceType place, + DataLayout layout); +template PD_INFER_DECL void Tensor::ShareExternalData( + const float16 *data, const std::vector &shape, PlaceType place, + DataLayout layout); + template PD_INFER_DECL void Tensor::CopyToCpu(float *data) const; template PD_INFER_DECL void Tensor::CopyToCpu(int64_t *data) const; template PD_INFER_DECL void Tensor::CopyToCpu(int32_t *data) const; diff --git a/paddle/fluid/inference/api/onnxruntime_predictor.cc b/paddle/fluid/inference/api/onnxruntime_predictor.cc new file mode 100644 index 0000000000000000000000000000000000000000..ee82da139d8f39c26002763c4a4835050c48fc99 --- /dev/null +++ b/paddle/fluid/inference/api/onnxruntime_predictor.cc @@ -0,0 +1,354 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/api/onnxruntime_predictor.h" + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid//platform/device/gpu/gpu_types.h" +#include "paddle/fluid/framework/feed_fetch_method.h" +#include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/var_type_traits.h" +#include "paddle/fluid/framework/variable_helper.h" +#include "paddle/fluid/framework/version.h" +#include "paddle/fluid/inference/analysis/helper.h" +#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h" +#include "paddle/fluid/inference/api/helper.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/api/paddle_inference_pass.h" +#include "paddle/fluid/inference/utils/io_utils.h" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/platform/cpu_helper.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/platform/profiler.h" + +namespace paddle { + +framework::proto::VarType::Type ConvertONNXType( + ONNXTensorElementDataType type) { + switch (type) { + case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT: + return framework::proto::VarType::FP32; + // case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16: + // return DataType::FP16; + case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8: + return framework::proto::VarType::INT8; + case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32: + return framework::proto::VarType::INT32; + case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64: + return framework::proto::VarType::INT64; + case ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8: + return framework::proto::VarType::UINT8; + default: + LOG(ERROR) << "unsupported ONNX Tensor Type: " << static_cast(type); + return framework::proto::VarType::FP32; + } +} + +bool CheckConvertToONNX(const AnalysisConfig &config) { + if (!config.model_dir().empty()) { + LOG(ERROR) << "Paddle2ONNX not support model_dir config"; + // TODO(heliqi jiangjiajun): Paddle2ONNX not support + // config.model_dir() + "/__model__" + // config.model_dir() + var_name + return false; + } 
else if (config.prog_file().empty() || config.params_file().empty()) { + LOG(ERROR) << string::Sprintf( + "not valid model path '%s' or program path '%s' or params path '%s'.", + config.model_dir(), config.prog_file(), config.params_file()); + return false; + } + return paddle2onnx::IsExportable(config.prog_file(), config.params_file(), + config.model_from_memory()); +} + +bool ONNXRuntimePredictor::Init() { + VLOG(3) << "ONNXRuntime Predictor::init()"; + + // Now ONNXRuntime only suuport CPU + if (config_.use_gpu()) { + place_ = paddle::platform::CUDAPlace(config_.gpu_device_id()); + } else { + place_ = paddle::platform::CPUPlace(); + } + scope_.reset(new paddle::framework::Scope()); + sub_scope_ = &scope_->NewScope(); + + std::string onnx_proto; + paddle2onnx::Export(config_.prog_file(), config_.params_file(), &onnx_proto, + config_.model_from_memory()); + + Ort::SessionOptions session_options; + if (config_.ort_optimization_enabled()) { + session_options.SetGraphOptimizationLevel( + GraphOptimizationLevel::ORT_ENABLE_ALL); + } + // Turn optimization off first, and then turn it on when it's stable + // session_options.SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL); + // session_options.EnableCpuMemArena(); + // session_options.EnableMemPattern(); + // session_options.SetInterOpNumThreads(config_.cpu_math_library_num_threads()); + session_options.SetIntraOpNumThreads(config_.cpu_math_library_num_threads()); + VLOG(2) << "ONNXRuntime threads " << config_.cpu_math_library_num_threads(); + if (config_.profile_enabled()) { + LOG(WARNING) << "ONNXRuntime Profiler is activated, which might affect the " + "performance"; +#if defined(_WIN32) + session_options.EnableProfiling(L"ONNX"); +#else + session_options.EnableProfiling("ONNX"); +#endif + } else { + VLOG(2) << "ONNXRuntime Profiler is deactivated, and no profiling report " + "will be " + "generated."; + } + session_ = {env_, onnx_proto.data(), onnx_proto.size(), session_options}; + + auto memory_info = + Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault); + Ort::Allocator allocator(session_, memory_info); + + framework::proto::VarType::Type proto_type = + framework::proto::VarType::LOD_TENSOR; + size_t n_inputs = session_.GetInputCount(); + for (size_t i = 0; i < n_inputs; ++i) { + auto input_name = session_.GetInputName(i, allocator); + auto type_info = session_.GetInputTypeInfo(i); + std::vector shape = + type_info.GetTensorTypeAndShapeInfo().GetShape(); + ONNXTensorElementDataType data_type = + type_info.GetTensorTypeAndShapeInfo().GetElementType(); + input_desc_.emplace_back(ONNXDesc{input_name, shape, data_type}); + auto *ptr = scope_->Var(input_name); + framework::InitializeVariable(ptr, proto_type); + allocator.Free(input_name); + } + + size_t n_outputs = session_.GetOutputCount(); + for (size_t i = 0; i < n_outputs; ++i) { + auto output_name = session_.GetOutputName(i, allocator); + auto type_info = session_.GetOutputTypeInfo(i); + std::vector shape = + type_info.GetTensorTypeAndShapeInfo().GetShape(); + ONNXTensorElementDataType data_type = + type_info.GetTensorTypeAndShapeInfo().GetElementType(); + output_desc_.emplace_back(ONNXDesc{output_name, shape, data_type}); + auto *ptr = scope_->Var(output_name); + framework::InitializeVariable(ptr, proto_type); + allocator.Free(output_name); + } + + return true; +} + +template <> +std::unique_ptr +CreatePaddlePredictor( + const AnalysisConfig &config) { + if (config.glog_info_disabled()) { + FLAGS_logtostderr = 1; + FLAGS_minloglevel = 2; // GLOG_ERROR + } + + 
PADDLE_ENFORCE_EQ( + config.is_valid(), true, + platform::errors::InvalidArgument( + "Note: Each config can only be used for one predictor.")); + + VLOG(3) << "create ONNXRuntimePredictor"; + + std::unique_ptr predictor(new ONNXRuntimePredictor(config)); + // Each config can only be used for one predictor. + config.SetInValid(); + auto predictor_p = dynamic_cast(predictor.get()); + + if (!predictor_p->Init()) { + return nullptr; + } + + return predictor; +} + +std::vector ONNXRuntimePredictor::GetInputNames() { + std::vector input_names; + for (auto input_desc : input_desc_) { + input_names.push_back(input_desc.name); + } + return input_names; +} + +std::map> +ONNXRuntimePredictor::GetInputTensorShape() { + std::map> input_shapes; + for (auto input_desc : input_desc_) { + input_shapes[input_desc.name] = input_desc.shape; + } + return input_shapes; +} + +std::vector ONNXRuntimePredictor::GetOutputNames() { + std::vector output_names; + for (auto output_desc : output_desc_) { + output_names.push_back(output_desc.name); + } + return output_names; +} + +std::unique_ptr ONNXRuntimePredictor::GetInputTensor( + const std::string &name) { + PADDLE_ENFORCE_NOT_NULL(scope_->FindVar(name), + platform::errors::PreconditionNotMet( + "The in variable named %s is not found in the " + "scope of the ONNXPredictor.", + name)); + std::unique_ptr res( + new ZeroCopyTensor(static_cast(scope_.get()))); + res->input_or_output_ = true; + res->SetName(name); + if (platform::is_cpu_place(place_)) { + res->SetPlace(PaddlePlace::kCPU); + } else { + auto gpu_place = place_; + res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId()); + } + return res; +} + +std::unique_ptr ONNXRuntimePredictor::GetOutputTensor( + const std::string &name) { + PADDLE_ENFORCE_NOT_NULL(scope_->FindVar(name), + platform::errors::PreconditionNotMet( + "The out variable named %s is not found in the " + "scope of the ONNXPredictor.", + name)); + std::unique_ptr res( + new ZeroCopyTensor(static_cast(scope_.get()))); + res->input_or_output_ = false; + res->SetName(name); + if (platform::is_cpu_place(place_)) { + res->SetPlace(PaddlePlace::kCPU); + } else { + auto gpu_place = place_; + res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId()); + } + return res; +} + +Ort::Value ONNXRuntimePredictor::GetOrtValue(const ONNXDesc &desc, + const char *device_name) { + Ort::MemoryInfo memory_info(device_name, OrtDeviceAllocator, + place_.GetDeviceId(), OrtMemTypeDefault); + auto *var = scope_->FindVar(desc.name); + auto *tensor = var->GetMutable(); + size_t size = + tensor->numel() * + framework::SizeOfType(framework::TransToProtoVarType(tensor->dtype())); + std::vector shape = phi::vectorize(tensor->dims()); + return Ort::Value::CreateTensor(memory_info, + static_cast(tensor->data()), size, + shape.data(), shape.size(), desc.dtype); +} + +void ONNXRuntimePredictor::AsTensor(const Ort::Value &value, + const ONNXDesc &desc) { + auto info = value.GetTensorTypeAndShapeInfo(); + + auto *var = scope_->FindVar(desc.name); + auto *tensor = var->GetMutable(); + tensor->Resize(phi::make_ddim(info.GetShape())); + auto dtype = ConvertONNXType(info.GetElementType()); + auto *ptr = tensor->mutable_data(place_, dtype); + + if (platform::is_cpu_place(place_)) { + std::memcpy(ptr, const_cast(value.GetTensorData()), + tensor->numel() * framework::SizeOfType(dtype)); + } else { + auto src_place = place_; + auto dst_place = place_; + memory::Copy(dst_place, ptr, src_place, + const_cast(value.GetTensorData()), + tensor->numel() * framework::SizeOfType(dtype)); + } +} + 
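+ // Note on data flow between Paddle and ONNXRuntime in this predictor: + // GetOrtValue() wraps the existing Paddle tensor buffer in an Ort::Value via + // Ort::Value::CreateTensor, so binding an input does not copy data, while + // AsTensor() resizes the Paddle tensor to the ONNX output shape and copies the + // output buffer back with std::memcpy / memory::Copy.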
+bool ONNXRuntimePredictor::Run(const std::vector &inputs, + std::vector *output_data, + int batch_size) { + LOG(ERROR) << "Not support Run"; + return false; +} + +bool ONNXRuntimePredictor::ZeroCopyRun() { + try { + Ort::IoBinding binding(session_); + std::vector inputs; + std::vector outputs; + Ort::RunOptions options; + + inputs.reserve(input_desc_.size()); + const char *device_name = config_.use_gpu() ? "Cuda" : "Cpu"; + for (auto desc : input_desc_) { + inputs.push_back(GetOrtValue(desc, device_name)); + binding.BindInput(desc.name.c_str(), inputs.back()); + } + + // TODO(heliqi): Optimization —— move to Init() + for (auto desc : output_desc_) { + Ort::MemoryInfo memory_info(device_name, OrtDeviceAllocator, + place_.GetDeviceId(), OrtMemTypeDefault); + binding.BindOutput(desc.name.c_str(), memory_info); + } + + session_.Run({}, binding); + + outputs = binding.GetOutputValues(); + for (size_t i = 0; i < output_desc_.size(); ++i) { + AsTensor(outputs[i], output_desc_[i]); + } + } catch (const std::exception &e) { + LOG(ERROR) << e.what(); + return false; + } + + return true; +} + +std::unique_ptr ONNXRuntimePredictor::Clone() { + LOG(ERROR) << "Not support Clone(), Please create new Predictor"; + return nullptr; +} + +uint64_t ONNXRuntimePredictor::TryShrinkMemory() { + return paddle::memory::Release(place_); +} + +ONNXRuntimePredictor::~ONNXRuntimePredictor() { + if (sub_scope_) { + scope_->DeleteScope(sub_scope_); + } + memory::Release(place_); +} + +} // namespace paddle diff --git a/paddle/fluid/inference/api/onnxruntime_predictor.h b/paddle/fluid/inference/api/onnxruntime_predictor.h new file mode 100644 index 0000000000000000000000000000000000000000..7fb07aa97bd2746773192456ddeba941a24e8906 --- /dev/null +++ b/paddle/fluid/inference/api/onnxruntime_predictor.h @@ -0,0 +1,225 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/naive_executor.h" +#include "paddle/fluid/framework/op_compatible_info.h" +#include "paddle/fluid/inference/analysis/analyzer.h" +#include "paddle/fluid/inference/api/api_impl.h" +#include "paddle/fluid/inference/api/details/reset_tensor_array.h" +#include "paddle/fluid/inference/api/helper.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/platform/device/gpu/gpu_types.h" +#include "paddle/fluid/string/printf.h" + +#include "onnxruntime_c_api.h" // NOLINT +#include "onnxruntime_cxx_api.h" // NOLINT +#include "paddle2onnx/converter.h" + +#ifdef PADDLE_WITH_TESTING +#include +#include +#endif + +/// +/// \file onnxruntime_predictor.h +/// +/// \brief A predictor using ONNXRuntime +/// +/// \author heliqi@baidu.com +/// \date 2022-02-14 +/// \since 2.3.0 +/// + +namespace paddle { + +bool CheckConvertToONNX(const AnalysisConfig &config); + +struct ONNXDesc { + std::string name; + std::vector shape; + ONNXTensorElementDataType dtype; +}; + +/// +/// \class ONNXRuntimePredictor +/// +/// \brief The ONNXRuntimePredictor using ONNXRuntime for inference +/// +/// The predictor has the following typical uses: +/// +/// Get predictor +/// \code{cpp} +/// auto predictor = CreatePaddlePredictor(config); +/// \endcode +/// +/// Get input or output names +/// \code{cpp} +/// auto input_names = predictor->GetInputNames(); +/// auto output_names = predictor->GetOutputNames(); +/// \endcode +/// +/// Get input or output tensors +/// \code{cpp} +/// auto input_t = predictor->GetInputTensor(input_names[0]); +/// auto output_t = predictor->GetOutputTensor(output_names[0]); +/// \endcode +/// +/// Run predictor +/// \code{cpp} +/// predictor->ZeroCopyRun(); +/// \endcode +/// +class ONNXRuntimePredictor : public PaddlePredictor { + public: + /// + /// \brief Construct a new ONNXRuntime Predictor object + /// + /// \param[in] AnalysisConfig config + /// + explicit ONNXRuntimePredictor(const AnalysisConfig &config) + : config_(config) { + predictor_id_ = inference::GetUniqueId(); + env_ = Ort::Env(ORT_LOGGING_LEVEL_INFO, "onnx"); + } + /// + /// \brief Destroy the ONNXRuntime Predictor object + /// + ~ONNXRuntimePredictor(); + + /// + /// \brief Initialize predictor + /// + /// \return Whether the init function executed successfully + /// + bool Init(); + + /// + /// \brief Get the input names + /// + /// \return input names + /// + std::vector GetInputNames(); + + /// + /// \brief Get the output names + /// + /// \return output names + /// + std::vector GetOutputNames(); + + /// + /// \brief Get the Input Tensor object + /// + /// \param[in] name input name + /// \return input tensor + /// + std::unique_ptr GetInputTensor( + const std::string &name) override; + + /// + /// \brief Get the Output Tensor object + /// + /// \param[in] name output name + /// \return output tensor + /// + std::unique_ptr GetOutputTensor( + const std::string &name) override; + /// + /// \brief Get all input names and their corresponding shapes + /// + /// \return the map of input names and shapes + /// + std::map> GetInputTensorShape() override; + + /// Not supported + bool Run(const std::vector &inputs, + std::vector *output_data, + int batch_size = -1) override; + + /// + /// \brief Run the prediction engine + /// + /// \return Whether the function executed successfully + /// + bool ZeroCopyRun() override; + + /// + /// \brief Release all tmp tensor to compress the size of the memory pool.
+ /// The memory pool is considered to be composed of a list of chunks, if + /// the chunk is not occupied, it can be released. + /// + /// \return Number of bytes released. It may be smaller than the actual + /// released memory, because part of the memory is not managed by the + /// MemoryPool. + /// + uint64_t TryShrinkMemory() override; + /// + /// \brief Clone to get a new predictor. Thread safe. + /// + /// \return a new predictor + /// + std::unique_ptr Clone() override; + + std::shared_ptr scope_; + + private: + /// + /// \brief Get the Ort::Value (input tensor). + /// + /// \param[in] desc an ONNXDesc (name, shape, dtype) + /// + /// \param[in] device_name device name, "Cpu" or "Cuda" + /// + /// \return an Ort::Value + /// + Ort::Value GetOrtValue(const ONNXDesc &desc, const char *device_name); + + /// + /// \brief Copy an Ort::Value (output tensor) back into the corresponding Paddle tensor. + /// + /// \param[in] value Ort::Value(output Tensor) + /// + /// \param[in] desc an ONNXDesc (name, shape, dtype) + /// + void AsTensor(const Ort::Value &value, const ONNXDesc &desc); + + private: + AnalysisConfig config_; + + // ONNXRuntime + Ort::Env env_; + Ort::Session session_{nullptr}; + + platform::Place place_; + framework::Scope *sub_scope_{nullptr}; + std::vector input_desc_; + std::vector output_desc_; + int predictor_id_; + +// Some more detailed tests are made friends of the predictor, so that +// all the details can be tested. +#if PADDLE_WITH_TESTING + FRIEND_TEST(ONNXRuntimePredictor, onnxruntime_on); +#endif +}; + +} // namespace paddle diff --git a/paddle/fluid/inference/api/onnxruntime_predictor_tester.cc b/paddle/fluid/inference/api/onnxruntime_predictor_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..2be2de9c60bb1c3fdedf13212d50a6f4e155d4df --- /dev/null +++ b/paddle/fluid/inference/api/onnxruntime_predictor_tester.cc @@ -0,0 +1,84 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+
+#include "paddle/fluid/inference/api/onnxruntime_predictor.h"
+
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+#include <string>
+#include <thread>  // NOLINT
+#include <vector>
+#include "paddle/fluid/framework/ir/pass.h"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/inference/api/helper.h"
+#include "paddle/fluid/inference/api/paddle_api.h"
+#include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "paddle/fluid/inference/tests/api/tester_helper.h"
+#include "paddle/fluid/inference/utils/io_utils.h"
+#include "paddle/fluid/platform/cpu_info.h"
+
+DEFINE_string(dirname, "", "dirname to tests.");
+
+namespace paddle {
+
+TEST(ONNXRuntimePredictor, onnxruntime_on) {
+  AnalysisConfig config;
+  config.SetModel(FLAGS_dirname + "/inference.pdmodel",
+                  FLAGS_dirname + "/inference.pdiparams");
+  config.EnableONNXRuntime();
+  config.EnableORTOptimization();
+  config.SetCpuMathLibraryNumThreads(2);
+  LOG(INFO) << config.Summary();
+
+  auto _predictor =
+      CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kONNXRuntime>(
+          config);
+  ASSERT_TRUE(_predictor);
+  auto* predictor = static_cast<ONNXRuntimePredictor*>(_predictor.get());
+
+  ASSERT_TRUE(predictor);
+  ASSERT_TRUE(!predictor->Clone());
+  ASSERT_TRUE(predictor->scope_);
+  ASSERT_TRUE(predictor->sub_scope_);
+  ASSERT_EQ(predictor->scope_->parent(), nullptr);
+  ASSERT_EQ(predictor->sub_scope_->parent(), predictor->scope_.get());
+  // Dummy Input Data
+  std::vector<int64_t> input_shape = {-1, 3, 224, 224};
+  std::vector<float> input_data(1 * 3 * 224 * 224, 1.0);
+  std::vector<float> out_data;
+  out_data.resize(1000);
+
+  // testing all interfaces
+  auto input_names = predictor->GetInputNames();
+  auto output_names = predictor->GetOutputNames();
+  auto get_input_shape = predictor->GetInputTensorShape();
+
+  ASSERT_EQ(input_names.size(), 1UL);
+  ASSERT_EQ(output_names.size(), 1UL);
+  ASSERT_EQ(input_names[0], "inputs");
+  ASSERT_EQ(output_names[0], "save_infer_model/scale_0.tmp_1");
+  ASSERT_EQ(get_input_shape["inputs"], input_shape);
+
+  auto input_tensor = predictor->GetInputTensor(input_names[0]);
+  input_tensor->Reshape({1, 3, 224, 224});
+  auto output_tensor = predictor->GetOutputTensor(output_names[0]);
+
+  input_tensor->CopyFromCpu(input_data.data());
+  ASSERT_TRUE(predictor->ZeroCopyRun());
+  output_tensor->CopyToCpu(out_data.data());
+
+  predictor->TryShrinkMemory();
+}
+
+}  // namespace paddle
diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h
index 180c028c6a61088edeb8723891d4de1ba2272b80..7b765e3fa8a24ef1b81b68da8ba12dd8e5577572 100644
--- a/paddle/fluid/inference/api/paddle_analysis_config.h
+++ b/paddle/fluid/inference/api/paddle_analysis_config.h
@@ -76,6 +76,54 @@ struct LiteNNAdapterConfig {
   LiteNNAdapterConfig& Disable();
 };
 
+struct DistConfig {
+  bool use_dist_model() const { return use_dist_model_; }
+  void EnableDistModel(bool use_dist_model) {
+    use_dist_model_ = use_dist_model;
+  }
+
+  std::vector<std::string> trainer_endpoints() const {
+    return trainer_endpoints_;
+  }
+
+  std::string current_endpoint() const { return current_endpoint_; }
+
+  void SetEndpoints(const std::vector<std::string>& trainer_endpoints,
+                    const std::string& current_endpoint) {
+    trainer_endpoints_ = trainer_endpoints;
+    current_endpoint_ = current_endpoint;
+  }
+
+  int64_t nranks() const { return nranks_; }
+
+  int64_t rank() const { return rank_; }
+
+  void SetRanks(int64_t nranks, int64_t rank) {
+    nranks_ = nranks;
+    rank_ = rank;
+  }
+
+  std::string comm_init_config() const { return comm_init_config_; }
+
+  void SetCommInitConfig(const std::string& comm_init_config) {
+    comm_init_config_ = comm_init_config;
+  }
+ + void SetCarrierId(const std::string& carrier_id) { carrier_id_ = carrier_id; } + + std::string carrier_id() const { return carrier_id_; } + + protected: + // DistModel Inference related + bool use_dist_model_{false}; // whether use DistModel or not + std::vector trainer_endpoints_{}; // all trainers' endpoints + std::string current_endpoint_{}; // current trainer's endpoint + int64_t nranks_{1}; // total ranks (number of trainers) + int64_t rank_{0}; // rank + std::string comm_init_config_{}; // converter config path + std::string carrier_id_{"inference"}; +}; + /// /// \brief configuration manager for AnalysisPredictor. /// \since 1.7.0 @@ -271,6 +319,18 @@ struct PD_INFER_DECL AnalysisConfig { /// void EnableNpu(int device_id = 0); /// + /// \brief Turn on ONNXRuntime. + /// + void EnableONNXRuntime(); + /// + /// \brief Turn off ONNXRuntime. + /// + void DisableONNXRuntime(); + /// + /// \brief Turn on ONNXRuntime Optimization. + /// + void EnableORTOptimization(); + /// /// \brief A boolean state telling whether the GPU is turned on. /// /// \return bool Whether the GPU is turned on. @@ -294,6 +354,19 @@ struct PD_INFER_DECL AnalysisConfig { /// bool use_ipu() const { return use_ipu_; } /// + /// \brief A boolean state telling whether the ONNXRuntime is turned on. + /// + /// \return bool Whether the ONNXRuntime is turned on. + /// + bool use_onnxruntime() const { return use_onnxruntime_; } + /// + /// \brief A boolean state telling whether the ONNXRuntime Optimization is + /// turned on. + /// + /// \return bool Whether the ONNXRuntime Optimization is turned on. + /// + bool ort_optimization_enabled() const { return enable_ort_optimization_; } + /// /// \brief Get the GPU device id. /// /// \return int The GPU device id. @@ -763,6 +836,12 @@ struct PD_INFER_DECL AnalysisConfig { LiteNNAdapterConfig& NNAdapter() { return nnadapter_config_; } + void SetDistConfig(const DistConfig& dist_config) { + dist_config_ = dist_config; + } + + const DistConfig& dist_config() const { return dist_config_; } + protected: // Update the config. void Update(); @@ -787,6 +866,10 @@ struct PD_INFER_DECL AnalysisConfig { bool use_npu_{false}; int npu_device_id_{0}; + // ONNXRuntime related + bool use_onnxruntime_{false}; + bool enable_ort_optimization_{false}; + // Padding related bool use_fc_padding_{true}; @@ -902,6 +985,9 @@ struct PD_INFER_DECL AnalysisConfig { mutable bool is_valid_{true}; std::string opt_cache_dir_; friend class paddle_infer::experimental::InternalUtils; + + // fleet exe related + DistConfig dist_config_{}; }; } // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h index c129efe494b4fb36bc72d3c93e24951ba7fef322..657dd9b600cce7173e3aa8d0156ba0975199cf98 100644 --- a/paddle/fluid/inference/api/paddle_api.h +++ b/paddle/fluid/inference/api/paddle_api.h @@ -192,6 +192,7 @@ class PD_INFER_DECL ZeroCopyTensor : public paddle_infer::Tensor { private: friend class AnalysisPredictor; + friend class ONNXRuntimePredictor; explicit ZeroCopyTensor(void* scope) : paddle_infer::Tensor{scope} {} }; @@ -381,6 +382,7 @@ enum class PaddleEngineKind { kNative = 0, ///< Use the native Fluid facility. kAutoMixedTensorRT, ///< Automatically mix Fluid with TensorRT. kAnalysis, ///< More optimization. 
+ kONNXRuntime, ///< Use ONNXRuntime }; template @@ -395,6 +397,11 @@ template <> PD_INFER_DECL std::unique_ptr CreatePaddlePredictor< AnalysisConfig, PaddleEngineKind::kAnalysis>(const AnalysisConfig& config); +template <> +PD_INFER_DECL std::unique_ptr +CreatePaddlePredictor( + const AnalysisConfig& config); + PD_INFER_DECL int PaddleDtypeSize(PaddleDType dtype); PD_INFER_DECL std::string get_version(); diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index 313e1f2faea553809cb6fce66ca9a751bace8d75..f5f36d805b43ea0815683e3b65bf157fe5beb2de 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -75,13 +75,11 @@ void PaddlePassBuilder::AppendAnalysisPass(const std::string &pass) { void PaddlePassBuilder::ClearPasses() { passes_.clear(); } const std::vector kTRTSubgraphPasses({ - "conv_affine_channel_fuse_pass", // - "adaptive_pool2d_convert_global_pass", - "conv_eltwiseadd_affine_channel_fuse_pass", // - "shuffle_channel_detect_pass", // - "quant_conv2d_dequant_fuse_pass", // - "delete_quant_dequant_op_pass", // - "delete_quant_dequant_filter_op_pass", // + "adaptive_pool2d_convert_global_pass", + "shuffle_channel_detect_pass", // + "quant_conv2d_dequant_fuse_pass", // + "delete_quant_dequant_op_pass", // + "delete_quant_dequant_filter_op_pass", // // "fc_fuse_pass", // "simplify_with_basic_ops_pass", // "embedding_eltwise_layernorm_fuse_pass", // @@ -134,22 +132,20 @@ const std::vector kLiteSubgraphPasses({ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) { passes_.assign({ // "identity_scale_op_clean_pass", // - "is_test_pass", // - "simplify_with_basic_ops_pass", // - "conv_affine_channel_fuse_pass", // - "conv_eltwiseadd_affine_channel_fuse_pass", // - "conv_bn_fuse_pass", // - "conv_eltwiseadd_bn_fuse_pass", // - "embedding_eltwise_layernorm_fuse_pass", // - "multihead_matmul_fuse_pass_v2", // - "gpu_cpu_squeeze2_matmul_fuse_pass", // - "gpu_cpu_reshape2_matmul_fuse_pass", // - "gpu_cpu_flatten2_matmul_fuse_pass", // - "gpu_cpu_map_matmul_v2_to_mul_pass", // - "gpu_cpu_map_matmul_v2_to_matmul_pass", // - "gpu_cpu_map_matmul_to_mul_pass", // - "fc_fuse_pass", // - "fc_elementwise_layernorm_fuse_pass", // + "is_test_pass", // + "simplify_with_basic_ops_pass", // + "conv_bn_fuse_pass", // + "conv_eltwiseadd_bn_fuse_pass", // + "embedding_eltwise_layernorm_fuse_pass", // + "multihead_matmul_fuse_pass_v2", // + "gpu_cpu_squeeze2_matmul_fuse_pass", // + "gpu_cpu_reshape2_matmul_fuse_pass", // + "gpu_cpu_flatten2_matmul_fuse_pass", // + "gpu_cpu_map_matmul_v2_to_mul_pass", // + "gpu_cpu_map_matmul_v2_to_matmul_pass", // + "gpu_cpu_map_matmul_to_mul_pass", // + "fc_fuse_pass", // + "fc_elementwise_layernorm_fuse_pass", // #if CUDNN_VERSION >= 7100 // To run conv_fusion, the version of cudnn must be // guaranteed at least v7 // cudnn8.0 has memory leak problem in conv + eltwise + act, so we @@ -236,14 +232,12 @@ void CpuPassStrategy::EnableMKLDNN() { passes_.insert(passes_.begin(), "mkldnn_placement_pass"); for (auto &pass : std::vector({ - "depthwise_conv_mkldnn_pass", // - "conv_bn_fuse_pass", // Execute BN passes again to - "conv_eltwiseadd_bn_fuse_pass", // preserve correct pass order - "conv_affine_channel_fuse_pass", // - "conv_eltwiseadd_affine_channel_fuse_pass", // - "conv_transpose_bn_fuse_pass", // - "conv_transpose_eltwiseadd_bn_fuse_pass", // - "conv_bias_mkldnn_fuse_pass", // + "depthwise_conv_mkldnn_pass", // + "conv_bn_fuse_pass", // Execute 
BN passes again to + "conv_eltwiseadd_bn_fuse_pass", // preserve correct pass order + "conv_transpose_bn_fuse_pass", // + "conv_transpose_eltwiseadd_bn_fuse_pass", // + "conv_bias_mkldnn_fuse_pass", // "conv_transpose_bias_mkldnn_fuse_pass", // TODO(baoachun): Need to support 5-dimensional input. // "conv3d_bias_mkldnn_fuse_pass", // diff --git a/paddle/fluid/inference/api/paddle_tensor.h b/paddle/fluid/inference/api/paddle_tensor.h index 81eecbb2c1480499b81556c48d021a8ff8929899..5a98d109aed79cc5bcefdc01b47a166bdf9c01d9 100644 --- a/paddle/fluid/inference/api/paddle_tensor.h +++ b/paddle/fluid/inference/api/paddle_tensor.h @@ -47,6 +47,8 @@ enum DataType { enum class PlaceType { kUNK = -1, kCPU, kGPU, kXPU, kNPU, kIPU }; +enum class DataLayout { kUNK = -1, kAny, kNHWC, kNCHW }; + /// \brief Represents an n-dimensional array of values. /// The Tensor is used to store the input or output of the network. /// Zero copy means that the tensor supports direct copy of host or device data @@ -92,6 +94,17 @@ class PD_INFER_DECL Tensor { template void CopyFromCpu(const T* data); + /// \brief Share the data with tensor data. + /// It's usually used to set the tensor data. + /// \param data The pointer of the data, from which the tensor will share. + /// \param shape The shape of data. + /// \param place The place of data. + /// \param layout The layout of data. Only NCHW is supported now. + template + void ShareExternalData(const T* data, const std::vector& shape, + PlaceType place, + DataLayout layout = DataLayout::kNCHW); + /// \brief Experimental interface. /// It's usually used to set the input tensor data with Strings data type. /// \param data The pointer of the data, from which the tensor will copy. diff --git a/paddle/fluid/inference/capi_exp/pd_config.cc b/paddle/fluid/inference/capi_exp/pd_config.cc index e342190fda1aca53a6814806e1afec1335224d79..d7b07652babbd1e24e2c650ac8ac079f03523d12 100644 --- a/paddle/fluid/inference/capi_exp/pd_config.cc +++ b/paddle/fluid/inference/capi_exp/pd_config.cc @@ -126,6 +126,26 @@ PD_Bool PD_ConfigUseGpu(__pd_keep PD_Config* pd_config) { return config->use_gpu(); } +void PD_ConfigEnableONNXRuntime(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + config->EnableONNXRuntime(); +} + +void PD_ConfigDisableONNXRuntime(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + config->DisableONNXRuntime(); +} + +PD_Bool PD_ConfigONNXRuntimeEnabled(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->use_onnxruntime(); +} + +void PD_ConfigEnableORTOptimization(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + config->EnableORTOptimization(); +} + void PD_ConfigEnableXpu(__pd_keep PD_Config* pd_config, int32_t l3_workspace_size, PD_Bool locked, PD_Bool autotune, const char* autotune_file, diff --git a/paddle/fluid/inference/capi_exp/pd_config.h b/paddle/fluid/inference/capi_exp/pd_config.h index c314aca918f141d30661d9034656899bbb816063..f6b754cad213f8d5249317468b5ceb21e863f6ad 100644 --- a/paddle/fluid/inference/capi_exp/pd_config.h +++ b/paddle/fluid/inference/capi_exp/pd_config.h @@ -152,6 +152,34 @@ PADDLE_CAPI_EXPORT extern void PD_ConfigDisableGpu( PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigUseGpu( __pd_keep PD_Config* pd_config); /// +/// \brief Turn on ONNXRuntime. +/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigEnableONNXRuntime( + __pd_keep PD_Config* pd_config); +/// +/// \brief Turn off ONNXRuntime. 
+/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigDisableONNXRuntime( + __pd_keep PD_Config* pd_config); +/// +/// \brief A boolean state telling whether the ONNXRutnime is turned on. +/// +/// \return Whether the ONNXRuntime is turned on. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigONNXRuntimeEnabled( + __pd_keep PD_Config* pd_config); +/// +/// \brief Turn on ONNXRuntime Optimization. +/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigEnableORTOptimization( + __pd_keep PD_Config* pd_config); +/// /// \brief Turn on XPU. /// /// \param[in] pd_onfig config diff --git a/paddle/fluid/inference/goapi/config.go b/paddle/fluid/inference/goapi/config.go index def26913b0a1c082b3a983cea5fa8021c468b59c..8f9f34c06b4768317d6f710ac49a7610a9ef9d6a 100644 --- a/paddle/fluid/inference/goapi/config.go +++ b/paddle/fluid/inference/goapi/config.go @@ -160,6 +160,36 @@ func (config *Config) EnableUseGpu(memorySize uint64, deviceId int32) { C.PD_ConfigEnableUseGpu(config.c, C.uint64_t(memorySize), C.int32_t(deviceId)) } +/// +/// \brief Turn on ONNXRuntime. +/// +func (config *Config) EnableONNXRuntime() { + C.PD_ConfigEnableONNXRuntime(config.c) +} + +/// +/// \brief Turn off ONNXRuntime. +/// +func (config *Config) DisableONNXRuntime() { + C.PD_ConfigDisableONNXRuntime(config.c) +} + +/// +/// \brief A boolean state telling whether the ONNXRuntime is turned on. +/// +/// \return bool Whether the ONNXRuntime is turned on. +/// +func (config *Config) ONNXRuntimeEnabled() bool { + return cvtPDBoolToGo(C.PD_ConfigONNXRuntimeEnabled(config.c)) +} + +/// +/// \brief Turn on ONNXRuntime Optimization. +/// +func (config *Config) EnableORTOptimization() { + C.PD_ConfigEnableORTOptimization(config.c) +} + /// /// \brief Turn on XPU. 
/// diff --git a/paddle/fluid/inference/goapi/config_test.go b/paddle/fluid/inference/goapi/config_test.go index b82161880839e500a20b787914e2827da151106b..297841dcbcf6c19aef4a536557ec30e76ea9f82c 100644 --- a/paddle/fluid/inference/goapi/config_test.go +++ b/paddle/fluid/inference/goapi/config_test.go @@ -122,3 +122,20 @@ func TestMkldnn(t *testing.T) { config.SetBfloat16Op([]string{"fc", "mul"}) } + +func TestONNXRuntime(t *testing.T) { + config := NewConfig() + config.SetModelDir("modelDir") + t.Log(config.ModelDir()) + + config.EnableONNXRuntime() + t.Logf("ONNXRuntimeEnabled:%+v", config.ONNXRuntimeEnabled()) + + config.DisableONNXRuntime() + t.Logf("ONNXRuntimeEnabled:%+v", config.ONNXRuntimeEnabled()) + + config.EnableORTOptimization() + + config.SetCpuMathLibraryNumThreads(4) + t.Logf("CpuMathLibraryNumThreads:%+v", config.CpuMathLibraryNumThreads()) +} \ No newline at end of file diff --git a/paddle/fluid/inference/goapi/predictor_test.go b/paddle/fluid/inference/goapi/predictor_test.go index 40e518304510c57fec9cd7609ecbd6eefa456050..755558f96238d11842f8245c2b36210c60d8a057 100644 --- a/paddle/fluid/inference/goapi/predictor_test.go +++ b/paddle/fluid/inference/goapi/predictor_test.go @@ -66,6 +66,42 @@ func TestNewPredictor(t *testing.T) { cloned.ClearIntermediateTensor() } +func TestONNXRuntimePredictor(t *testing.T) { + t.Logf("Version:\n%+v", Version()) + config := NewConfig() + config.SetModel("./mobilenetv1/inference.pdmodel", "./mobilenetv1/inference.pdiparams") + config.EnableONNXRuntime() + config.EnableORTOptimization() + predictor := NewPredictor(config) + inNames := predictor.GetInputNames() + t.Logf("InputNames:%+v", inNames) + outNames := predictor.GetOutputNames() + t.Logf("OutputNames:%+v", outNames) + + inHandle := predictor.GetInputHandle(inNames[0]) + inHandle.Reshape([]int32{1, 3, 224, 224}) + t.Logf("inHandle name:%+v, shape:%+v", inHandle.Name(), inHandle.Shape()) + + data := make([]float32, numElements([]int32{1, 3, 224, 224})) + for i := 0; i < int(numElements([]int32{1, 3, 224, 224})); i++ { + data[i] = float32(i%255) * 0.1 + } + inHandle.CopyFromCpu(data) + t.Logf("inHandle Type:%+v", inHandle.Type()) + + predictor.Run() + + outHandle := predictor.GetOutputHandle(outNames[0]) + t.Logf("outHandle name:%+v", outHandle.Name()) + + outShape := outHandle.Shape() + t.Logf("outHandle Shape:%+v", outShape) + outData := make([]float32, numElements(outShape)) + outHandle.CopyToCpu(outData) + t.Log(outData) +} + + func TestFromBuffer(t *testing.T) { modelFile, err := os.Open("./mobilenetv1/inference.pdmodel") if err != nil { diff --git a/paddle/fluid/inference/goapi/test.sh b/paddle/fluid/inference/goapi/test.sh index edccc2648c012fda9e22c2fc14ffe4f90dc26cfe..cff9fd4aa7ceada2a37d9650c9ce3653f0155447 100644 --- a/paddle/fluid/inference/goapi/test.sh +++ b/paddle/fluid/inference/goapi/test.sh @@ -22,6 +22,7 @@ fi # 2. set LD_LIBRARY_PATH export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$PWD/paddle_inference_c/third_party/install/mklml/lib/:$PWD/paddle_inference_c/third_party/install/mkldnn/lib/:$PWD/paddle_inference_c/paddle/lib/ +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$PWD/paddle_inference_c/third_party/install/onnxruntime/lib/:$PWD/paddle_inference_c/third_party/install/paddle2onnx/lib/ # 3. 
go test go clean -testcache diff --git a/paddle/fluid/inference/tensorrt/convert/dropout_op.cc b/paddle/fluid/inference/tensorrt/convert/dropout_op.cc index 8c61200f7f57cdf57b372c37c8f7cea40c4a8d4c..b69292827aa136fd1d8a1f66d80823e6344a6174 100644 --- a/paddle/fluid/inference/tensorrt/convert/dropout_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/dropout_op.cc @@ -89,5 +89,5 @@ class DropoutOpConverter : public OpConverter { } // namespace inference } // namespace paddle -USE_OP(dropout); +USE_OP_ITSELF(dropout); REGISTER_TRT_OP_CONVERTER(dropout, DropoutOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc index a432ff62810aa30c01c1980c80bf3f344039f7dd..f19b21d3e632633d7066c3e9e14cadd2900eb339 100644 --- a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc @@ -335,15 +335,37 @@ class MultiheadMatMulOpConverter : public OpConverter { reshape_before_fc_dim.d[4] = 1; auto* reshape_before_fc_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input); + if (enable_int8) { + engine_->SetTensorDynamicRange(reshape_before_fc_layer->getOutput(0), + in_scale); + } reshape_before_fc_layer->setReshapeDimensions(reshape_before_fc_dim); reshape_before_fc_layer->setName( ("shuffle_before_multihead_mamul(Output: " + output_name + ")") .c_str()); // add layer fc - auto* fc_layer = TRT_ENGINE_ADD_LAYER( - engine_, FullyConnected, *reshape_before_fc_layer->getOutput(0), n, - weight.get(), bias.get()); + nvinfer1::ILayer* fc_layer = nullptr; + if (enable_int8) { + nvinfer1::DimsHW nv_ksize(1, 1); + fc_layer = TRT_ENGINE_ADD_LAYER( + engine_, Convolution, *reshape_before_fc_layer->getOutput(0), n, + nv_ksize, weight.get(), bias.get()); + } else { + fc_layer = TRT_ENGINE_ADD_LAYER( + engine_, FullyConnected, *reshape_before_fc_layer->getOutput(0), + n, weight.get(), bias.get()); + } + + if (enable_int8) { + PADDLE_ENFORCE_EQ( + op_desc.HasAttr("fc_out_threshold"), true, + platform::errors::InvalidArgument( + "must have out threshold in multihead layers in int8 mode")); + float out_scale = + BOOST_GET_CONST(float, op_desc.GetAttr("fc_out_threshold")); + engine_->SetTensorDynamicRange(fc_layer->getOutput(0), out_scale); + } fc_layer->setName( ("multihead_mamul_fc(Output: " + output_name + ")").c_str()); @@ -359,6 +381,10 @@ class MultiheadMatMulOpConverter : public OpConverter { plugin_inputs.push_back(input_bias_qk); bool with_fp16 = engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); + + if (enable_int8) { + with_fp16 = 1; + } plugin::DynamicPluginTensorRT* plugin = new plugin::QkvToContextPluginDynamic(hidden_in, head_number, head_size, scale, with_fp16); diff --git a/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc index 71c4348685e1b01e158aa298c48953fc3a354cec..753cd70727643d660bb1ffd3607706613f595c78 100644 --- a/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc +++ b/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc @@ -105,7 +105,7 @@ class SkipLayerNormOpConverter : public OpConverter { "in CustomSkipLayerNormPluginDynamic hidden " "dimension should > 0")); if (enable_int8) { - type = static_cast(nvinfer1::DataType::kINT8); + type = static_cast(nvinfer1::DataType::kHALF); } const std::vector fields{ diff --git a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc 
b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc index f2dc5ba1c7c2c832e0239f6a30760c354aaf4699..7f7313fbcb5969aafea47ad23248acd5a6ca3644 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc @@ -52,7 +52,7 @@ TEST(Relu6OpConverter, main) { test_activation("relu6"); } } // namespace inference } // namespace paddle -USE_OP(relu); +USE_OP_ITSELF(relu); USE_OP(sigmoid); USE_OP(tanh); USE_OP(relu6); diff --git a/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc index 95916746d6fcb528d26a8f8bb39980b55c4f3704..b96992ef8514abe0f71dbf23d38abb626f6c4a5b 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc @@ -16,7 +16,7 @@ limitations under the License. */ #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" -USE_OP(conv2d); +USE_OP_ITSELF(conv2d); USE_OP(conv2d_transpose); namespace paddle { diff --git a/paddle/fluid/inference/tensorrt/convert/test_dropout_op.cc b/paddle/fluid/inference/tensorrt/convert/test_dropout_op.cc index 474fd92071fb0795b868f0cd86591061cf8b6581..cf377396087637f115523ddc60a468e2a23d57d4 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_dropout_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_dropout_op.cc @@ -57,4 +57,4 @@ TEST(DropoutOpConverter, main) { } // namespace inference } // namespace paddle -USE_OP(dropout); +USE_OP_ITSELF(dropout); diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index b2764ca61c11219e5546867813157b7f05ee3ce8..d53a8923af6120adb460d95fc81820b6dfa03a60 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -54,6 +54,8 @@ TRT_DT FluidDataType2TRT(FluidDT type) { return TRT_DT::kFLOAT; case FluidDT::VarType_Type_INT32: return TRT_DT::kINT32; + case FluidDT::VarType_Type_FP16: + return TRT_DT::kHALF; default: return TRT_DT::kINT32; } diff --git a/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu index 57177cfa8b421e1d79004bb1a7f738d98dc00f97..336005d883b0f523213060645e540c35a14e4e9c 100644 --- a/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu @@ -16,7 +16,6 @@ #include #include "paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.h" -#include "paddle/fluid/operators/detection/yolo_box_op.h" namespace paddle { namespace inference { diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 37214534f3c937bcf62bb34b51da2c934c566ced..8c96499a022f7e9f0d1fd8c512070592cf7428ff 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -453,6 +453,23 @@ if(WITH_MKLDNN) download_int8_data_without_verify(${INT8_GOOGLENET_MODEL_DIR} "GoogleNet_int8_model.tar.gz" ) inference_analysis_api_int8_test_run_custom_warmup_batch_size(test_analyzer_int8_googlenet ${INT8_IMG_CLASS_TEST_APP} ${INT8_GOOGLENET_MODEL_DIR} ${IMAGENET_DATA_PATH} 10) + # mobilenetv3_large_x1_0 int8 + set(INT8_MOBILENETV3_LARGE_MODEL_DIR "${INT8_DATA_DIR}/mobilenetv3_large") + set(INT8_MOBILENETV3_FILE_NAME "MobileNetV3_large_x1_0_infer.tar") + if (NOT EXISTS 
${INT8_MOBILENETV3_LARGE_MODEL_DIR}/${INT8_MOBILENETV3_FILE_NAME}) + inference_download_and_uncompress_without_verify(${INT8_MOBILENETV3_LARGE_MODEL_DIR} "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/inference/" ${INT8_MOBILENETV3_FILE_NAME}) + endif() + inference_analysis_test_run(test_analyzer_int8_mobilenetv3_large + COMMAND ${INT8_IMG_CLASS_TEST_APP} + ARGS --infer_model=${INT8_MOBILENETV3_LARGE_MODEL_DIR}/MobileNetV3_large_x1_0_infer + --infer_data=${IMAGENET_DATA_PATH} + --warmup_batch_size=50 + --batch_size=1 + --enable_int8=true + --cpu_num_threads=${CPU_NUM_THREADS_ON_CI} + --iterations=100 + --with_accuracy_layer=false) + ### BFLOAT16 tests # build test binary to be used in subsequent tests @@ -472,6 +489,17 @@ if(WITH_MKLDNN) # mobilenetv2 bfloat16 inference_analysis_api_bfloat16_test_run(test_analyzer_bfloat16_mobilenetv2 ${BF16_IMG_CLASS_TEST_APP} ${INT8_MOBILENETV2_MODEL_DIR} ${IMAGENET_DATA_PATH}) + # mobilenetv3_large + inference_analysis_test_run(test_analyzer_bfloat16_mobilenetv3_large + COMMAND ${BF16_IMG_CLASS_TEST_APP} + ARGS --infer_model=${INT8_MOBILENETV3_LARGE_MODEL_DIR}/MobileNetV3_large_x1_0_infer + --infer_data=${IMAGENET_DATA_PATH} + --batch_size=1 + --enable_bf16=true + --paddle_num_threads=${CPU_NUM_THREADS_ON_CI} + --iterations=100 + --with_accuracy_layer=false) + ### Object detection models set(PASCALVOC_DATA_PATH "${INT8_DATA_DIR}/pascalvoc_val_head_300.bin") set(INT8_OBJ_DETECT_TEST_APP "test_analyzer_int8_object_detection") @@ -692,6 +720,12 @@ inference_analysis_test(test_analyzer_zerocopytensor_tensor SRCS analyzer_zeroco EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${OCR_INSTALL_DIR}/model) +if(WITH_DISTRIBUTE AND WITH_PSCORE AND NOT (WITH_ASCEND OR WITH_ASCEND_CL)) + inference_analysis_test(test_analyzer_dist_model SRCS analyzer_dist_model_tester.cc + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} + ARGS --infer_model=${OCR_INSTALL_DIR}/model) +endif() + inference_analysis_test(test_analyzer_paddletensor_tensor SRCS analyzer_paddle_tensor_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${OCR_INSTALL_DIR}/model --infer_data=${OCR_INSTALL_DIR}/data.txt --refer_result=${OCR_INSTALL_DIR}/result.txt) @@ -739,6 +773,7 @@ if(WITH_MKLDNN) set_tests_properties(test_analyzer_quant_performance_benchmark PROPERTIES TIMEOUT 120) set_tests_properties(test_analyzer_int8_mobilenetv2 PROPERTIES TIMEOUT 120) set_tests_properties(test_analyzer_int8_mobilenetv1 PROPERTIES TIMEOUT 120) + set_tests_properties(test_analyzer_int8_mobilenetv3_large PROPERTIES TIMEOUT 120) endif() set_tests_properties(lite_resnet50_test PROPERTIES TIMEOUT 120) diff --git a/paddle/fluid/inference/tests/api/analyzer_bfloat16_image_classification_tester.cc b/paddle/fluid/inference/tests/api/analyzer_bfloat16_image_classification_tester.cc index 3b16b0d34fd4cb87879bb6ed585e72b48167ac2c..f267f0f28d685e51f0359a345c52fbbe4a49fa16 100644 --- a/paddle/fluid/inference/tests/api/analyzer_bfloat16_image_classification_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_bfloat16_image_classification_tester.cc @@ -14,13 +14,19 @@ limitations under the License. 
*/ #include "paddle/fluid/inference/api/paddle_analysis_config.h" #include "paddle/fluid/inference/tests/api/tester_helper.h" +#include "paddle/fluid/platform/cpu_info.h" namespace paddle { namespace inference { namespace analysis { void SetConfig(AnalysisConfig *cfg) { - cfg->SetModel(FLAGS_infer_model); + std::ifstream model_file(FLAGS_infer_model + "/__model__"); + if (model_file.good()) + cfg->SetModel(FLAGS_infer_model); + else + cfg->SetModel(FLAGS_infer_model + "/inference.pdmodel", + FLAGS_infer_model + "/inference.pdiparams"); cfg->DisableGpu(); cfg->SwitchIrOptim(); cfg->SwitchSpecifyInputNames(); @@ -38,7 +44,12 @@ TEST(Analyzer_bfloat16_image_classification, bfloat16) { // read data from file and prepare batches with test data std::vector> input_slots_all; SetInputs(&input_slots_all); - b_cfg.EnableMkldnnBfloat16(); + if (FLAGS_enable_bf16 && + platform::MayIUse(platform::cpu_isa_t::avx512_bf16)) { + b_cfg.EnableMkldnnBfloat16(); + } else { + FLAGS_enable_bf16 = false; + } CompareBFloat16AndAnalysis(&cfg, &b_cfg, input_slots_all); } diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_config_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_config_tester.cc index df0eb58c2bd587e69215602512cc51f19c97a978..a341ffd7a081c24500e3b061b0ce3510a2aaacbc 100644 --- a/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_config_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_config_tester.cc @@ -81,6 +81,18 @@ TEST(PD_Config, interface) { PD_ConfigSetBfloat16Op(config, 1, &ops_name); #endif + PD_ConfigEnableONNXRuntime(config); + bool onnxruntime_enabled = PD_ConfigONNXRuntimeEnabled(config); +#ifdef PADDLE_WITH_ONNXRUNTIME + EXPECT_TRUE(onnxruntime_enabled); +#else + EXPECT_FALSE(onnxruntime_enabled); +#endif + PD_ConfigDisableONNXRuntime(config); + bool onnxruntime_disabled = PD_ConfigONNXRuntimeEnabled(config); + EXPECT_FALSE(onnxruntime_disabled); + PD_ConfigEnableORTOptimization(config); + PD_ConfigEnableMemoryOptim(config, true); bool memory_enabled = PD_ConfigMemoryOptimEnabled(config); EXPECT_TRUE(memory_enabled); diff --git a/paddle/fluid/inference/tests/api/analyzer_dist_model_tester.cc b/paddle/fluid/inference/tests/api/analyzer_dist_model_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..7cf6e2adfc688f70e0ed31f7c1f5305206aa1702 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_dist_model_tester.cc @@ -0,0 +1,72 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
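The DistConfig block added to AnalysisConfig earlier in this patch is exercised single-rank by the dist-model tester below. As an illustrative sketch only (not part of the patch), a hypothetical two-trainer setup would be wired up as follows; the endpoint addresses and file paths are placeholders.

// Sketch only: configure rank 0 of a hypothetical two-trainer DistModel run.
#include "paddle/fluid/inference/api/paddle_inference_api.h"

paddle::AnalysisConfig MakeRank0Config() {
  paddle::AnalysisConfig config;
  config.SetModel("./model/__model__", "./model/__params__");
  config.EnableUseGpu(100, 0);
  config.SwitchUseFeedFetchOps(false);

  paddle::DistConfig dist;
  dist.EnableDistModel(true);
  dist.SetRanks(/*nranks=*/2, /*rank=*/0);  // this process is trainer 0 of 2
  dist.SetEndpoints({"127.0.0.1:6170", "127.0.0.1:6171"}, "127.0.0.1:6170");
  dist.SetCommInitConfig("./comm_init_cfg.csv");  // converter config path
  config.SetDistConfig(dist);
  return config;
}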
+ +#include "gtest/gtest.h" +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" +#include "paddle/fluid/inference/utils/singleton.h" + +namespace paddle { +namespace inference { + +TEST(test_dist_model, dist_model) { + std::cout << "Analysis Predictor DistModel test." << std::endl; + AnalysisConfig config; + config.SetModel(FLAGS_infer_model + "/__model__", + FLAGS_infer_model + "/__params__"); + config.SwitchUseFeedFetchOps(false); + config.EnableUseGpu(100, 0); + DistConfig dist_config; + dist_config.SetRanks(1, 0); + dist_config.EnableDistModel(true); + dist_config.SetEndpoints({""}, ""); + config.SetDistConfig(dist_config); + + auto predictor = paddle_infer::CreatePredictor(config); + int batch_size = 1; + int channels = 1; + int height = 48; + int width = 512; + int nums = batch_size * channels * height * width; + std::cout << "Created predictor." << std::endl; + + float* input = new float[nums]; + for (int i = 0; i < nums; ++i) input[i] = 0; + auto input_names = predictor->GetInputNames(); + + auto input_t = predictor->GetInputHandle(input_names[0]); + input_t->Reshape({batch_size, channels, height, width}); + input_t->CopyFromCpu(input); + std::cout << "Input data." << std::endl; + + predictor->Run(); + std::cout << "Zero Copy Run." << std::endl; + + std::vector out_data; + auto output_names = predictor->GetOutputNames(); + auto output_t = predictor->GetOutputHandle(output_names[0]); + std::vector output_shape = output_t->shape(); + int out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1, + std::multiplies()); + out_data.resize(out_num); + output_t->CopyToCpu(out_data.data()); + std::cout << "Output data." 
<< std::endl; + delete[] input; +} + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_int8_image_classification_tester.cc b/paddle/fluid/inference/tests/api/analyzer_int8_image_classification_tester.cc index 8f8b73044232a5cacfa3609e5f8e32ccf375d418..b07163b518b529e7ab01107e1f0d217443f574bd 100644 --- a/paddle/fluid/inference/tests/api/analyzer_int8_image_classification_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_int8_image_classification_tester.cc @@ -22,7 +22,12 @@ namespace inference { namespace analysis { void SetConfig(AnalysisConfig *cfg) { - cfg->SetModel(FLAGS_infer_model); + std::ifstream model_file(FLAGS_infer_model + "/__model__"); + if (model_file.good()) + cfg->SetModel(FLAGS_infer_model); + else + cfg->SetModel(FLAGS_infer_model + "/inference.pdmodel", + FLAGS_infer_model + "/inference.pdiparams"); cfg->DisableGpu(); cfg->SwitchIrOptim(); cfg->SwitchSpecifyInputNames(); diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index 637fa16e31ba7996713a6971c3a1802627811e7f..e63dfd14175b9955fbf5b6fdb0fb7904a330f264 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -213,15 +213,15 @@ std::shared_ptr> GetWarmupData( element_in_batch * 3 * 224 * 224, 3 * 224 * 224, static_cast(images.data.data()) + i * 3 * 224 * 224); - - std::copy_n(static_cast(test_data[batch][1].data.data()) + - element_in_batch, - 1, static_cast(labels.data.data()) + i); + if (FLAGS_with_accuracy_layer) + std::copy_n(static_cast(test_data[batch][1].data.data()) + + element_in_batch, + 1, static_cast(labels.data.data()) + i); } - - auto warmup_data = std::make_shared>(2); + auto warmup_data = std::make_shared>( + FLAGS_with_accuracy_layer ? 
2 : 1); (*warmup_data)[0] = std::move(images); - (*warmup_data)[1] = std::move(labels); + if (FLAGS_with_accuracy_layer) (*warmup_data)[1] = std::move(labels); return warmup_data; } @@ -254,9 +254,13 @@ void SetInputs(std::vector> *inputs, } for (auto i = 0; i < iterations; i++) { auto images = image_reader.NextBatch(); - auto labels = label_reader.NextBatch(); - inputs->emplace_back( - std::vector{std::move(images), std::move(labels)}); + std::vector tmp_vec; + tmp_vec.push_back(std::move(images)); + if (FLAGS_with_accuracy_layer) { + auto labels = label_reader.NextBatch(); + tmp_vec.push_back(std::move(labels)); + } + inputs->push_back(std::move(tmp_vec)); } } @@ -825,7 +829,8 @@ void CompareQuantizedAndAnalysis( SummarizePerformance("FP32", sample_latency_fp32, "INT8", sample_latency_int8); - CompareAccuracy(quantized_outputs, analysis_outputs, compared_idx); + if (FLAGS_with_accuracy_layer) + CompareAccuracy(quantized_outputs, analysis_outputs, compared_idx); } void CompareBFloat16AndAnalysis( @@ -864,7 +869,8 @@ void CompareBFloat16AndAnalysis( SummarizePerformance("FP32", sample_latency_fp32, "BF16", sample_latency_bf16); - CompareAccuracy(bf16_outputs, analysis_outputs, compared_idx); + if (FLAGS_with_accuracy_layer) + CompareAccuracy(bf16_outputs, analysis_outputs, compared_idx); } void CompareAnalysisAndAnalysis( diff --git a/paddle/fluid/inference/tests/infer_ut/CMakeLists.txt b/paddle/fluid/inference/tests/infer_ut/CMakeLists.txt index 9d83f8ff8fdc4756450c0fe9ae4d7096d9afa76f..f376cbd4fb302b1d7a038d958465f24db653e220 100644 --- a/paddle/fluid/inference/tests/infer_ut/CMakeLists.txt +++ b/paddle/fluid/inference/tests/infer_ut/CMakeLists.txt @@ -5,6 +5,7 @@ option(WITH_GPU "Compile demo with GPU/CPU, default use CPU." option(WITH_STATIC_LIB "Compile demo with static/shared library, default use static." OFF) option(USE_TENSORRT "Compile demo with TensorRT." 
OFF) option(WITH_GTEST "Compile demo with GTEST" OFF) +option(WITH_ONNXRUNTIME "Compile demo with ONNXRuntime" OFF) if(NOT WITH_STATIC_LIB) add_definitions("-DPADDLE_WITH_SHARED_LIB") @@ -45,6 +46,13 @@ link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}gflags/lib") link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/lib") link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}cryptopp/lib") link_directories("${PADDLE_LIB}/paddle/lib") +if (WITH_ONNXRUNTIME) + include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/include") + include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}paddle2onnx/include") + + link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib") + link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}paddle2onnx/lib") +endif() if (WIN32) add_definitions("/DGOOGLE_GLOG_DLL_DECL=") @@ -172,6 +180,16 @@ else() endif() endif() +if (WITH_ONNXRUNTIME) + if(WIN32) + set(DEPS ${DEPS} ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/onnxruntime.lib paddle2onnx) + elseif(APPLE) + set(DEPS ${DEPS} ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/libonnxruntime.1.10.0.dylib paddle2onnx) + else() + set(DEPS ${DEPS} ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/libonnxruntime.so.1.10.0 paddle2onnx) + endif() +endif() + if (NOT WIN32) set(EXTERNAL_LIB "-lrt -ldl -lpthread") set(DEPS ${DEPS} @@ -248,6 +266,14 @@ if(WIN32) COMMAND ${CMAKE_COMMAND} -E copy ${OPENBLAS_LIB_PATH}/lib/openblas.dll ${CMAKE_BINARY_DIR}/Release ) endif() + if(WITH_ONNXRUNTIME) + add_custom_command(TARGET ${DEMO_NAME} POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/onnxruntime.dll + ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE} + COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE_LIB_THIRD_PARTY_PATH}paddle2onnx/lib/paddle2onnx.dll + ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE} + ) + endif() if(NOT WITH_STATIC_LIB) add_custom_command(TARGET ${DEMO_NAME} POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy "${PADDLE_LIB}/paddle/lib/paddle_inference.dll" ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE} diff --git a/paddle/fluid/inference/tests/infer_ut/run.sh b/paddle/fluid/inference/tests/infer_ut/run.sh index dd4b64f28d739776ee750205d41b4dce35a97640..8123d3785003471fd5f63f24fbb1166913d7e571 100755 --- a/paddle/fluid/inference/tests/infer_ut/run.sh +++ b/paddle/fluid/inference/tests/infer_ut/run.sh @@ -20,7 +20,8 @@ TURN_ON_MKL=$2 # use MKL or Openblas TEST_GPU_CPU=$3 # test both GPU/CPU mode or only CPU mode DATA_DIR=$4 # dataset TENSORRT_ROOT_DIR=$5 # TensorRT ROOT dir, default to /usr/local/TensorRT -MSVC_STATIC_CRT=$6 +WITH_ONNXRUNTIME=$6 +MSVC_STATIC_CRT=$7 inference_install_dir=${PADDLE_ROOT}/build/paddle_inference_install_dir EXIT_CODE=0 # init default exit code WIN_DETECT=$(echo `uname` | grep "Win") # detect current platform @@ -144,7 +145,8 @@ function compile_test() { -DMSVC_STATIC_CRT=$MSVC_STATIC_CRT \ -DWITH_GTEST=ON \ -DCMAKE_CXX_FLAGS='/std:c++17' \ - -DCMAKE_BUILD_TYPE=Release + -DCMAKE_BUILD_TYPE=Release \ + -DWITH_ONNXRUNTIME=$WITH_ONNXRUNTIME msbuild /maxcpucount /property:Configuration=Release ALL_BUILD.vcxproj else cmake .. 
-DPADDLE_LIB=${inference_install_dir} \ @@ -154,7 +156,8 @@ function compile_test() { -DWITH_STATIC_LIB=OFF \ -DUSE_TENSORRT=$USE_TENSORRT \ -DTENSORRT_ROOT=$TENSORRT_ROOT_DIR \ - -DWITH_GTEST=ON + -DWITH_GTEST=ON \ + -DWITH_ONNXRUNTIME=$WITH_ONNXRUNTIME make -j$(nproc) fi; cd - diff --git a/paddle/fluid/inference/tests/test.cmake b/paddle/fluid/inference/tests/test.cmake index 05c468b798886ac135ed30bff75ce9400f1ca3a1..6b6c0cd22f03b902f08d7a79236b1091b9fe6677 100644 --- a/paddle/fluid/inference/tests/test.cmake +++ b/paddle/fluid/inference/tests/test.cmake @@ -80,6 +80,14 @@ if(NOT EXISTS ${IMG_CLS_RESNET_INSTALL_DIR}/image_classification_resnet.inferenc endif() set(IMG_CLS_RESNET_MODEL_DIR "${IMG_CLS_RESNET_INSTALL_DIR}/image_classification_resnet.inference.model") +if(WITH_ONNXRUNTIME) + set(MOBILENETV2_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/MobileNetV2") + if(NOT EXISTS ${MOBILENETV2_INSTALL_DIR}/MobileNetV2.inference.model.tar.gz) + inference_download_and_uncompress_without_verify(${MOBILENETV2_INSTALL_DIR} ${INFERENCE_URL} "MobileNetV2.inference.model.tar.gz") + endif() + set(MOBILENETV2_MODEL_DIR "${MOBILENETV2_INSTALL_DIR}/MobileNetV2") +endif() + function (inference_base_test_build TARGET) set(options "") set(oneValueArgs "") diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt index 6cd7d87332323f4bafd49b8b16254f9610405658..a7a417c29a7bdb7a47d4798246de55c0bd3536f9 100644 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -17,7 +17,7 @@ if (WITH_GPU) nv_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard) nv_library(cuda_managed_allocator SRCS cuda_managed_allocator.cc DEPS allocator cuda_device_guard gpu_info) nv_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator) - nv_library(stream_safe_cuda_allocator SRCS stream_safe_cuda_allocator.cc DEPS allocator) + nv_library(stream_safe_cuda_allocator SRCS stream_safe_cuda_allocator.cc DEPS allocator cuda_graph) nv_library(thread_local_allocator SRCS thread_local_allocator.cc DEPS allocator) cc_test(thread_local_allocator_test SRCS thread_local_allocator_test.cc DEPS thread_local_allocator) diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 4d0e485285146e5668793d29fd8effc789fcc339..61e292a922f0e98a958d4fe2f8fc7850bdf47e18 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -193,10 +193,10 @@ class AllocatorFacadePrivate { } #endif #ifdef PADDLE_WITH_CUSTOM_DEVICE - auto device_types = platform::DeviceManager::GetAllCustomDeviceTypes(); + auto device_types = phi::DeviceManager::GetAllCustomDeviceTypes(); for (const auto& dev_type : device_types) { for (size_t dev_id = 0; - dev_id < platform::DeviceManager::GetDeviceCount(dev_type); + dev_id < phi::DeviceManager::GetDeviceCount(dev_type); ++dev_id) { InitNaiveBestFitCustomDeviceAllocator( platform::CustomPlace(dev_type, dev_id)); @@ -210,12 +210,7 @@ class AllocatorFacadePrivate { InitNaiveBestFitCPUAllocator(); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) allow_free_idle_chunk_ = allow_free_idle_chunk; - if (FLAGS_use_stream_safe_cuda_allocator) { - for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); - ++dev_id) { - InitStreamSafeCUDAAllocator(platform::CUDAPlace(dev_id), nullptr); - } - } else { + if (!FLAGS_use_stream_safe_cuda_allocator) { for (int dev_id = 0; dev_id < 
platform::GetGPUDeviceCount(); ++dev_id) { InitAutoGrowthCUDAAllocator(platform::CUDAPlace(dev_id), @@ -240,10 +235,10 @@ class AllocatorFacadePrivate { } #endif #ifdef PADDLE_WITH_CUSTOM_DEVICE - auto device_types = platform::DeviceManager::GetAllCustomDeviceTypes(); + auto device_types = phi::DeviceManager::GetAllCustomDeviceTypes(); for (const auto& dev_type : device_types) { for (size_t dev_id = 0; - dev_id < platform::DeviceManager::GetDeviceCount(dev_type); + dev_id < phi::DeviceManager::GetDeviceCount(dev_type); ++dev_id) { InitAutoGrowthCustomDeviceAllocator( platform::CustomPlace(dev_type, dev_id), allow_free_idle_chunk); @@ -298,6 +293,12 @@ class AllocatorFacadePrivate { } CheckAllocThreadSafe(); + +#ifdef PADDLE_WITH_CUDA + if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) { + WrapCUDAGraphAllocator(); + } +#endif } inline const std::shared_ptr& GetAllocator( @@ -388,39 +389,6 @@ class AllocatorFacadePrivate { allocation.get())); return stream_safe_cuda_allocation->GetOwningStream(); } - -#ifdef PADDLE_WITH_CUDA - void PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) { - PADDLE_ENFORCE_EQ(strategy_, AllocatorStrategy::kAutoGrowth, - platform::errors::InvalidArgument( - "CUDA Graph is only supported when the " - "FLAGS_allocator_strategy=\"auto_growth\", but got " - "FLAGS_allocator_strategy=\"%s\"", - FLAGS_allocator_strategy)); - auto& allocator = cuda_graph_allocator_map_[id]; - PADDLE_ENFORCE_EQ( - allocator.get(), nullptr, - platform::errors::InvalidArgument( - "The memory pool of the CUDA Graph with ID %d have been prepared.", - id)); - allocator.reset( - new AllocatorFacadePrivate(/*allow_free_idle_chunk=*/false)); - for (auto& item : allocator->allocators_) { - auto& old_allocator = item.second; - old_allocator = CUDAGraphAllocator::Create(old_allocator); - } - VLOG(10) << "Prepare memory pool for CUDA Graph with ID " << id; - } - - void RemoveMemoryPoolOfCUDAGraph(CUDAGraphID id) { - auto iter = cuda_graph_allocator_map_.find(id); - PADDLE_ENFORCE_NE(iter, cuda_graph_allocator_map_.end(), - platform::errors::InvalidArgument( - "Cannot find CUDA Graph with ID = %d", id)); - cuda_graph_allocator_map_.erase(iter); - VLOG(10) << "Remove memory pool of CUDA Graph with ID " << id; - } -#endif #endif private: @@ -439,24 +407,7 @@ class AllocatorFacadePrivate { platform::Place place_; }; - const AllocatorMap& GetAllocatorMap() { -#ifdef PADDLE_WITH_CUDA - if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) { - auto id = platform::CUDAGraph::CapturingID(); - auto iter = cuda_graph_allocator_map_.find(id); - PADDLE_ENFORCE_NE( - iter, cuda_graph_allocator_map_.end(), - platform::errors::PermissionDenied( - "No memory pool is prepared for CUDA Graph capturing.")); - VLOG(10) << "Choose CUDA Graph memory pool to allocate memory"; - return iter->second->allocators_; - } else { - return allocators_; - } -#else - return allocators_; -#endif - } + const AllocatorMap& GetAllocatorMap() { return allocators_; } void InitNaiveBestFitCPUAllocator() { allocators_[platform::CPUPlace()] = @@ -672,10 +623,10 @@ class AllocatorFacadePrivate { } void WrapStreamSafeCUDAAllocator(platform::CUDAPlace p, gpuStream_t stream) { - const std::shared_ptr& underlying_allocator = - cuda_allocators_[p][stream]; - cuda_allocators_[p][stream] = std::make_shared( - underlying_allocator, p, stream); + std::shared_ptr& allocator = cuda_allocators_[p][stream]; + allocator = std::make_shared( + allocator, p, stream, + /* in_cuda_graph_capturing = */ !allow_free_idle_chunk_); } void 
WrapCUDARetryAllocator(platform::CUDAPlace p, gpuStream_t stream, @@ -684,10 +635,19 @@ class AllocatorFacadePrivate { retry_time, 0, platform::errors::InvalidArgument( "Retry time should be larger than 0, but got %d", retry_time)); - std::shared_ptr allocator = cuda_allocators_[p][stream]; + std::shared_ptr& allocator = cuda_allocators_[p][stream]; allocator = std::make_shared(allocator, retry_time); } +#ifdef PADDLE_WITH_CUDA + void WrapCUDAGraphAllocator() { + for (auto& item : allocators_) { + auto& allocator = item.second; + allocator = CUDAGraphAllocator::Create(allocator); + } + } +#endif + static void CheckCUDAAllocThreadSafe(const CUDAAllocatorMap& allocators) { for (auto& place_pair : allocators) { for (auto& stream_pair : place_pair.second) { @@ -738,7 +698,7 @@ class AllocatorFacadePrivate { auto custom_allocator = std::make_shared(p); allocators_[p] = std::make_shared( - custom_allocator, platform::DeviceManager::GetMinChunkSize(p), + custom_allocator, phi::DeviceManager::GetMinChunkSize(p), allow_free_idle_chunk); } #endif @@ -814,11 +774,10 @@ class AllocatorFacadePrivate { } #endif #ifdef PADDLE_WITH_CUSTOM_DEVICE - auto device_types = platform::DeviceManager::GetAllCustomDeviceTypes(); + auto device_types = phi::DeviceManager::GetAllCustomDeviceTypes(); for (const auto& dev_type : device_types) { for (size_t dev_id = 0; - dev_id < platform::DeviceManager::GetDeviceCount(dev_type); - dev_id++) { + dev_id < phi::DeviceManager::GetDeviceCount(dev_type); dev_id++) { places.emplace_back(platform::CustomPlace(dev_type, dev_id)); } } @@ -865,10 +824,6 @@ class AllocatorFacadePrivate { // a standalone CUDA allocator to support multi-stream GC in new executor CUDAAllocatorMap cuda_allocators_; std::shared_timed_mutex cuda_allocator_mutex_; -#ifdef PADDLE_WITH_CUDA - std::unordered_map> - cuda_graph_allocator_map_; -#endif #endif AllocatorStrategy strategy_; AllocatorMap allocators_; @@ -887,8 +842,24 @@ AllocatorFacade::AllocatorFacade() : m_(new AllocatorFacadePrivate()) {} AllocatorFacade::~AllocatorFacade() {} AllocatorFacade& AllocatorFacade::Instance() { - static AllocatorFacade instance; - return instance; + static AllocatorFacade* instance = new AllocatorFacade; + return *instance; +} + +AllocatorFacadePrivate* AllocatorFacade::GetPrivate() const { +#ifdef PADDLE_WITH_CUDA + if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) { + auto id = platform::CUDAGraph::CapturingID(); + auto iter = cuda_graph_map_.find(id); + PADDLE_ENFORCE_NE( + iter, cuda_graph_map_.end(), + platform::errors::PermissionDenied( + "No memory pool is prepared for CUDA Graph capturing.")); + VLOG(10) << "Choose CUDA Graph memory pool"; + return iter->second.get(); + } +#endif + return m_; } const std::shared_ptr& AllocatorFacade::GetAllocator( @@ -896,19 +867,14 @@ const std::shared_ptr& AllocatorFacade::GetAllocator( #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) && FLAGS_use_system_allocator == false) { -#ifdef PADDLE_WITH_CUDA - if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { - return m_->GetAllocator(place, - /* A non-zero num to choose allocator_ */ 1); - } -#endif - + AllocatorFacadePrivate* m = GetPrivate(); platform::CUDAPlace cuda_place(place.GetDeviceId()); - return m_->GetAllocator(cuda_place, m_->GetDefaultStream(cuda_place)); + return m->GetAllocator(cuda_place, m->GetDefaultStream(cuda_place)); } #endif - return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1); + return 
GetPrivate()->GetAllocator( + place, /* A non-zero num to choose allocator_ */ 1); } void* AllocatorFacade::GetBasePtr( @@ -923,7 +889,7 @@ void* AllocatorFacade::GetBasePtr( "GetBasePtr() is only implemented for CUDAPlace(), not " "suppot place: %s", allocation->place())); - return m_->GetBasePtr(allocation); + return GetPrivate()->GetBasePtr(allocation); } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) @@ -931,21 +897,17 @@ const std::shared_ptr& AllocatorFacade::GetAllocator( const platform::Place& place, const gpuStream_t& stream) { if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) && FLAGS_use_system_allocator == false) { -#ifdef PADDLE_WITH_CUDA - if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { - return m_->GetAllocator(place, - /* A non-zero num to choose allocator_ */ 1); - } -#endif - return m_->GetAllocator(place, stream, /*create_if_not_found=*/true); + return GetPrivate()->GetAllocator(place, stream, + /*create_if_not_found=*/true); } - return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1); + return GetPrivate()->GetAllocator( + place, /* A non-zero num to choose allocator_ */ 1); } #endif const std::shared_ptr& AllocatorFacade::GetZeroAllocator( const platform::Place& place) { - return m_->GetAllocator(place, /* zero size */ 0); + return GetPrivate()->GetAllocator(place, /* zero size */ 0); } std::shared_ptr AllocatorFacade::AllocShared( @@ -958,43 +920,30 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) && size > 0 && FLAGS_use_system_allocator == false) { -#ifdef PADDLE_WITH_CUDA - if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { - return m_->GetAllocator(place, size)->Allocate(size); - } -#endif - platform::CUDAPlace cuda_place(place.GetDeviceId()); - return Alloc(cuda_place, size, m_->GetDefaultStream(cuda_place)); + phi::Stream default_stream = phi::Stream(reinterpret_cast( + GetPrivate()->GetDefaultStream(cuda_place))); + return Alloc(cuda_place, size, default_stream); } #endif - - return m_->GetAllocator(place, size)->Allocate(size); + return GetPrivate()->GetAllocator(place, size)->Allocate(size); } uint64_t AllocatorFacade::Release(const platform::Place& place) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) && FLAGS_use_system_allocator == false) { -#ifdef PADDLE_WITH_CUDA - if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { - return m_ - ->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1) - ->Release(place); - } -#endif - platform::CUDAPlace cuda_place(place.GetDeviceId()); - return Release(cuda_place, m_->GetDefaultStream(cuda_place)); + return Release(cuda_place, GetPrivate()->GetDefaultStream(cuda_place)); } #endif - return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1) + return GetPrivate() + ->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1) ->Release(place); } std::shared_ptr AllocatorFacade::AllocShared( const platform::Place& place, size_t size, const phi::Stream& stream) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_EQ( FLAGS_use_stream_safe_cuda_allocator, true, platform::errors::Unimplemented( @@ -1002,71 +951,53 @@ std::shared_ptr AllocatorFacade::AllocShared( "multi-stream 'AllocaShared' function. 
To enable it, you can enter" "'export FLAGS_use_stream_safe_cuda_allocator=true' in the " "terminal.")); - -#ifdef PADDLE_WITH_CUDA - if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { - PADDLE_THROW(platform::errors::Unavailable( - "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator")); - } -#endif - gpuStream_t s = reinterpret_cast(stream.id()); - return std::shared_ptr(Alloc(place, size, s)); -#else - PADDLE_THROW(platform::errors::PreconditionNotMet("Not compiled with GPU.")); -#endif + return std::shared_ptr(Alloc(place, size, stream)); } -bool AllocatorFacade::InSameStream( - const std::shared_ptr& allocation, - const phi::Stream& stream) { +AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size, + const phi::Stream& stream) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_EQ( FLAGS_use_stream_safe_cuda_allocator, true, platform::errors::Unimplemented( "StreamSafeCUDAAllocator is disabled, you should not call this " - "multi-stream 'InSameStream' function. To enable it, you can enter" + "multi-stream 'Alloc' function. To enable it, you can enter" "'export FLAGS_use_stream_safe_cuda_allocator=true' in the " "terminal.")); -#ifdef PADDLE_WITH_CUDA - if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { - PADDLE_THROW(platform::errors::Unavailable( - "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator")); + platform::CUDAPlace p(place.GetDeviceId()); + if (LIKELY(size > 0 && FLAGS_use_system_allocator == false)) { + gpuStream_t s = reinterpret_cast(stream.id()); + return GetPrivate() + ->GetAllocator(p, s, /* create_if_not_found = */ true) + ->Allocate(size); + } else { + return GetPrivate()->GetAllocator(p, size)->Allocate(size); } -#endif - gpuStream_t s = reinterpret_cast(stream.id()); - return s == GetStream(allocation); #else PADDLE_THROW(platform::errors::PreconditionNotMet("Not compiled with GPU.")); #endif } +bool AllocatorFacade::InSameStream( + const std::shared_ptr& allocation, + const phi::Stream& stream) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size, - const gpuStream_t& stream) { PADDLE_ENFORCE_EQ( FLAGS_use_stream_safe_cuda_allocator, true, platform::errors::Unimplemented( "StreamSafeCUDAAllocator is disabled, you should not call this " - "multi-stream 'Alloc' function. To enable it, you can enter" + "multi-stream 'InSameStream' function. To enable it, you can enter" "'export FLAGS_use_stream_safe_cuda_allocator=true' in the " "terminal.")); - -#ifdef PADDLE_WITH_CUDA - if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { - PADDLE_THROW(platform::errors::Unavailable( - "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator")); - } + gpuStream_t s = reinterpret_cast(stream.id()); + return s == GetStream(allocation); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet("Not compiled with GPU.")); #endif - platform::CUDAPlace p(place.GetDeviceId()); - if (LIKELY(size > 0 && FLAGS_use_system_allocator == false)) { - return m_->GetAllocator(p, stream, /* create_if_not_found = */ true) - ->Allocate(size); - } else { - return m_->GetAllocator(p, size)->Allocate(size); - } } +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) uint64_t AllocatorFacade::Release(const platform::CUDAPlace& place, const gpuStream_t& stream) { PADDLE_ENFORCE_EQ( @@ -1076,15 +1007,7 @@ uint64_t AllocatorFacade::Release(const platform::CUDAPlace& place, "multi-stream 'Release' function. 
To enable it, you can enter" "'export FLAGS_use_stream_safe_cuda_allocator=true' in the " "terminal.")); - -#ifdef PADDLE_WITH_CUDA - if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { - PADDLE_THROW(platform::errors::Unavailable( - "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator")); - } -#endif - - return m_->GetAllocator(place, stream)->Release(place); + return GetPrivate()->GetAllocator(place, stream)->Release(place); } void AllocatorFacade::RecordStream(std::shared_ptr allocation, @@ -1096,15 +1019,7 @@ void AllocatorFacade::RecordStream(std::shared_ptr allocation, "'RecordStream' function. To enable it, you can enter" "'export FLAGS_use_stream_safe_cuda_allocator=true' in the " "terminal.")); - -#ifdef PADDLE_WITH_CUDA - if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { - PADDLE_THROW(platform::errors::Unavailable( - "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator")); - } -#endif - - m_->RecordStream(allocation, stream); + GetPrivate()->RecordStream(allocation, stream); } const gpuStream_t& AllocatorFacade::GetStream( @@ -1116,24 +1031,34 @@ const gpuStream_t& AllocatorFacade::GetStream( "'GetStream' function. To enable it, you can enter" "'export FLAGS_use_stream_safe_cuda_allocator=true' in the " "terminal.")); - -#ifdef PADDLE_WITH_CUDA - if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { - PADDLE_THROW(platform::errors::Unavailable( - "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator")); - } -#endif - - return m_->GetStream(allocation); + return GetPrivate()->GetStream(allocation); } #ifdef PADDLE_WITH_CUDA void AllocatorFacade::PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) { - return m_->PrepareMemoryPoolForCUDAGraph(id); + PADDLE_ENFORCE_EQ(GetAllocatorStrategy(), AllocatorStrategy::kAutoGrowth, + platform::errors::InvalidArgument( + "CUDA Graph is only supported when the " + "FLAGS_allocator_strategy=\"auto_growth\", but got " + "FLAGS_allocator_strategy=\"%s\"", + FLAGS_allocator_strategy)); + auto& allocator = cuda_graph_map_[id]; + PADDLE_ENFORCE_EQ( + allocator.get(), nullptr, + platform::errors::InvalidArgument( + "The memory pool of the CUDA Graph with ID %d have been prepared.", + id)); + allocator.reset(new AllocatorFacadePrivate(/*allow_free_idle_chunk=*/false)); + VLOG(10) << "Prepare memory pool for CUDA Graph with ID " << id; } void AllocatorFacade::RemoveMemoryPoolOfCUDAGraph(CUDAGraphID id) { - return m_->RemoveMemoryPoolOfCUDAGraph(id); + auto iter = cuda_graph_map_.find(id); + PADDLE_ENFORCE_NE(iter, cuda_graph_map_.end(), + platform::errors::InvalidArgument( + "Cannot find CUDA Graph with ID = %d", id)); + cuda_graph_map_.erase(iter); + VLOG(10) << "Remove memory pool of CUDA Graph with ID " << id; } #endif #endif diff --git a/paddle/fluid/memory/allocation/allocator_facade.h b/paddle/fluid/memory/allocation/allocator_facade.h index 1722a06b01f1302c3bb1f98c99af0431ab62f955..9066bb284e28af197111b5d3ea129cc65b5fe914 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.h +++ b/paddle/fluid/memory/allocation/allocator_facade.h @@ -49,6 +49,8 @@ class AllocatorFacade { static AllocatorFacade& Instance(); + AllocatorFacadePrivate* GetPrivate() const; + const std::shared_ptr& GetAllocator(const platform::Place& place); void* GetBasePtr(const std::shared_ptr& allocation); @@ -73,13 +75,14 @@ class AllocatorFacade { size_t size, const phi::Stream& stream); + AllocationPtr Alloc(const platform::Place& place, size_t size, + const phi::Stream& stream); + bool InSameStream(const std::shared_ptr& allocation, 
const phi::Stream& stream); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // TODO(zhiqiu): change gpuStream_t to phi::Stream if needed. - AllocationPtr Alloc(const platform::Place& place, size_t size, - const gpuStream_t& stream); uint64_t Release(const platform::CUDAPlace& place, const gpuStream_t& stream); void RecordStream(std::shared_ptr allocation, const gpuStream_t& stream); @@ -96,6 +99,10 @@ class AllocatorFacade { private: AllocatorFacade(); AllocatorFacadePrivate* m_; +#ifdef PADDLE_WITH_CUDA + std::unordered_map> + cuda_graph_map_; +#endif }; } // namespace allocation diff --git a/paddle/fluid/memory/allocation/custom_allocator.cc b/paddle/fluid/memory/allocation/custom_allocator.cc index bd52c8f4ad270f0f70a23ab39b78bd9363ede769..e53d7b1cc766a3e277ef0a773671ef678bcb3ac7 100644 --- a/paddle/fluid/memory/allocation/custom_allocator.cc +++ b/paddle/fluid/memory/allocation/custom_allocator.cc @@ -32,17 +32,16 @@ void CustomAllocator::FreeImpl(phi::Allocation* allocation) { } phi::Allocation* CustomAllocator::AllocateImpl(size_t size) { - std::call_once(once_flag_, - [this] { platform::DeviceManager::SetDevice(place_); }); + std::call_once(once_flag_, [this] { phi::DeviceManager::SetDevice(place_); }); void* ptr = - platform::DeviceManager::GetDeviceWithPlace(place_)->MemoryAllocate(size); + phi::DeviceManager::GetDeviceWithPlace(place_)->MemoryAllocate(size); if (LIKELY(ptr)) { return new Allocation(ptr, size, place_); } size_t avail, total; - platform::DeviceManager::MemoryStats(place_, &total, &avail); + phi::DeviceManager::MemoryStats(place_, &total, &avail); auto dev_type = platform::PlaceHelper::GetDeviceType(place_); auto dev_id = platform::PlaceHelper::GetDeviceId(place_); diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc index ea6d7019be6c1caf4844469276f3113525b33dfc..0bfbe2c6962294fc7e4aa2fff079e9cf411f26f8 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc @@ -739,7 +739,7 @@ class BuddyAllocatorList { private: explicit BuddyAllocatorList(const std::string &device_type) : device_type_(device_type) { - auto devices = platform::DeviceManager::GetDeviceList(device_type); + auto devices = phi::DeviceManager::GetDeviceList(device_type); for (auto dev_id : devices) { init_flags_[dev_id].reset(new std::once_flag()); } @@ -766,15 +766,15 @@ class BuddyAllocatorList { device_type_, dev_id)); std::call_once(*init_flags_[dev_id], [this, dev_id] { - platform::DeviceManager::SetDevice(device_type_, dev_id); + phi::DeviceManager::SetDevice(device_type_, dev_id); platform::CustomPlace place(device_type_, dev_id); allocators_[dev_id].reset(new BuddyAllocator( std::unique_ptr( new detail::CustomAllocator(device_type_, dev_id)), - platform::DeviceManager::GetMinChunkSize(place), - platform::DeviceManager::GetMaxChunkSize(place), - platform::DeviceManager::GetExtraPaddingSize(place), device_type_)); + phi::DeviceManager::GetMinChunkSize(place), + phi::DeviceManager::GetMaxChunkSize(place), + phi::DeviceManager::GetExtraPaddingSize(place), device_type_)); }); return allocators_[dev_id].get(); @@ -808,9 +808,9 @@ void *Alloc(const platform::CustomPlace &place, auto *ptr = buddy_allocator->Alloc(size); if (ptr == nullptr) { - platform::DeviceGuard guard(place); + phi::DeviceGuard guard(place); size_t avail, total; - platform::DeviceManager::MemoryStats(place, &total, &avail); + phi::DeviceManager::MemoryStats(place, 
&total, &avail); PADDLE_THROW(platform::errors::ResourceExhausted( "Cannot allocate %s in %s:%d, avaliable %s, total %s, used " "%s. ", @@ -819,8 +819,7 @@ void *Alloc(const platform::CustomPlace &place, string::HumanReadableSize(total - avail))); } else { if (FLAGS_init_allocated_mem) { - platform::DeviceManager::GetDeviceWithPlace(place)->MemorySet(ptr, 0xEF, - size); + phi::DeviceManager::GetDeviceWithPlace(place)->MemorySet(ptr, 0xEF, size); } } VLOG(10) << " pointer=" << ptr; diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc index 8627e3e6f8811e162ce3014c01145f331a03ee4b..072c4dee3bc45b4ff5f23f5288d3412a14f63b0f 100644 --- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc +++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc @@ -15,56 +15,52 @@ #include "paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h" #include "paddle/fluid/platform/profiler/event_tracing.h" +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/device/gpu/cuda/cuda_graph.h" +#endif + namespace paddle { namespace memory { namespace allocation { StreamSafeCUDAAllocation::StreamSafeCUDAAllocation( - DecoratedAllocationPtr underlying_allocation, gpuStream_t owning_stream) + DecoratedAllocationPtr underlying_allocation, gpuStream_t owning_stream, + StreamSafeCUDAAllocator* allocator) : Allocation(underlying_allocation->ptr(), underlying_allocation->base_ptr(), underlying_allocation->size(), underlying_allocation->place()), underlying_allocation_(std::move(underlying_allocation)), - owning_stream_(std::move(owning_stream)) {} + owning_stream_(std::move(owning_stream)), + allocator_(allocator->shared_from_this()) {} void StreamSafeCUDAAllocation::RecordStream(const gpuStream_t& stream) { VLOG(8) << "Try record stream " << stream << " for address " << ptr(); if (stream == owning_stream_) { - VLOG(9) << "Record the same stream of " << stream; return; } std::lock_guard lock_guard(outstanding_event_map_lock_); - gpuEvent_t record_event; - auto it = outstanding_event_map_.find(stream); - if (it == outstanding_event_map_.end()) { - gpuEvent_t new_event; #ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_GPU_SUCCESS( - cudaEventCreateWithFlags(&new_event, cudaEventDisableTiming)); -#else - PADDLE_ENFORCE_GPU_SUCCESS( - hipEventCreateWithFlags(&new_event, hipEventDisableTiming)); -#endif - outstanding_event_map_[stream] = new_event; - record_event = new_event; - VLOG(9) << "Create a new event " << new_event; - } else { - record_event = it->second; - VLOG(9) << "Reuse event " << record_event; + if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) { + graph_capturing_stream_set_.insert(stream); + return; } - -#ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(record_event, stream)); -#else - PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(record_event, stream)); #endif - VLOG(8) << "Record event " << record_event << " to stream " << stream; + + RecordStreamWithNoGraphCapturing(stream); + RecordGraphCapturingStreams(); } bool StreamSafeCUDAAllocation::CanBeFreed() { - // NOTE(Ruibiao): This function will not execute concurrently, - // so outstanding_event_lock_ is not required here +#ifdef PADDLE_WITH_CUDA + if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) { + return graph_capturing_stream_set_.empty() && + outstanding_event_map_.empty(); + } +#endif + + RecordGraphCapturingStreams(); + for (auto it = outstanding_event_map_.begin(); it != outstanding_event_map_.end(); ++it) { gpuEvent_t& 
event = it->second; @@ -98,21 +94,62 @@ const gpuStream_t& StreamSafeCUDAAllocation::GetOwningStream() const { return owning_stream_; } +void StreamSafeCUDAAllocation::RecordGraphCapturingStreams() { + for (gpuStream_t stream : graph_capturing_stream_set_) { + RecordStreamWithNoGraphCapturing(stream); + } + graph_capturing_stream_set_.clear(); +} + +void StreamSafeCUDAAllocation::RecordStreamWithNoGraphCapturing( + const gpuStream_t& stream) { + gpuEvent_t record_event; + auto it = outstanding_event_map_.find(stream); + if (it == outstanding_event_map_.end()) { + gpuEvent_t new_event; +#ifdef PADDLE_WITH_CUDA + PADDLE_ENFORCE_GPU_SUCCESS( + cudaEventCreateWithFlags(&new_event, cudaEventDisableTiming)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + hipEventCreateWithFlags(&new_event, hipEventDisableTiming)); +#endif + outstanding_event_map_[stream] = new_event; + record_event = new_event; + VLOG(9) << "Create a new event " << new_event; + } else { + record_event = it->second; + VLOG(9) << "Reuse event " << record_event; + } + +#ifdef PADDLE_WITH_CUDA + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(record_event, stream)); +#else + PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(record_event, stream)); +#endif + VLOG(8) << "Record event " << record_event << " to stream " << stream; +} + StreamSafeCUDAAllocator::StreamSafeCUDAAllocator( std::shared_ptr underlying_allocator, platform::CUDAPlace place, - gpuStream_t default_stream) + gpuStream_t default_stream, bool in_cuda_graph_capturing) : underlying_allocator_(std::move(underlying_allocator)), place_(std::move(place)), - default_stream_(std::move(default_stream)) { - std::lock_guard lock_guard(allocator_map_lock_); - allocator_map_[place].emplace_back(this); + default_stream_(std::move(default_stream)), + in_cuda_graph_capturing_(in_cuda_graph_capturing) { + if (LIKELY(!in_cuda_graph_capturing)) { + std::lock_guard lock_guard(allocator_map_lock_); + allocator_map_[place].emplace_back(this); + } } StreamSafeCUDAAllocator::~StreamSafeCUDAAllocator() { - std::lock_guard lock_guard(allocator_map_lock_); - std::vector& allocators = allocator_map_[place_]; - allocators.erase(std::remove(allocators.begin(), allocators.end(), this), - allocators.end()); + if (LIKELY(!in_cuda_graph_capturing_)) { + std::lock_guard lock_guard(allocator_map_lock_); + std::vector& allocators = allocator_map_[place_]; + allocators.erase(std::remove(allocators.begin(), allocators.end(), this), + allocators.end()); + } } bool StreamSafeCUDAAllocator::IsAllocThreadSafe() const { return true; } @@ -140,7 +177,7 @@ phi::Allocation* StreamSafeCUDAAllocator::AllocateImpl(size_t size) { } StreamSafeCUDAAllocation* allocation = new StreamSafeCUDAAllocation( static_unique_ptr_cast(std::move(underlying_allocation)), - default_stream_); + default_stream_, this); VLOG(8) << "Allocate " << allocation->size() << " bytes at address " << allocation->ptr(); return allocation; @@ -157,22 +194,27 @@ void StreamSafeCUDAAllocator::FreeImpl(phi::Allocation* allocation) { "StreamSafeCUDAAllocation*", allocation)); VLOG(8) << "Try free allocation " << stream_safe_cuda_allocation->ptr(); - std::lock_guard lock_guard(unfreed_allocation_lock_); if (stream_safe_cuda_allocation->CanBeFreed()) { VLOG(9) << "Directly delete allocation"; delete stream_safe_cuda_allocation; } else { VLOG(9) << "Put into unfreed_allocation list"; + std::lock_guard lock_guard(unfreed_allocation_lock_); unfreed_allocations_.emplace_back(stream_safe_cuda_allocation); } } uint64_t StreamSafeCUDAAllocator::ReleaseImpl(const platform::Place& 
place) { + if (UNLIKELY(in_cuda_graph_capturing_)) { + VLOG(7) << "Memory release forbidden in CUDA Graph Captruing"; + return 0; + } + std::lock_guard lock_guard(allocator_map_lock_); std::vector& allocators = allocator_map_[place]; uint64_t released_size = 0; for (StreamSafeCUDAAllocator* allocator : allocators) { - released_size += allocator->ProcessUnfreedAllocationsWithRelease(); + released_size += allocator->ProcessUnfreedAllocationsAndRelease(); } VLOG(8) << "Release " << released_size << " bytes memory from all streams"; return released_size; @@ -191,7 +233,7 @@ void StreamSafeCUDAAllocator::ProcessUnfreedAllocations() { } } -uint64_t StreamSafeCUDAAllocator::ProcessUnfreedAllocationsWithRelease() { +uint64_t StreamSafeCUDAAllocator::ProcessUnfreedAllocationsAndRelease() { ProcessUnfreedAllocations(); return underlying_allocator_->Release(place_); } diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h index 7354836308cfba0338fb2e146cc14182006876ee..ecddff97c206be968148e32ddf3f9c6623bf8bde 100644 --- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h +++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h @@ -14,10 +14,9 @@ #pragma once -#include #include #include -#include +#include #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/memory/allocation/spin_lock.h" #include "paddle/fluid/platform/place.h" @@ -32,27 +31,38 @@ namespace paddle { namespace memory { namespace allocation { +class StreamSafeCUDAAllocator; + class StreamSafeCUDAAllocation : public Allocation { public: StreamSafeCUDAAllocation(DecoratedAllocationPtr underlying_allocation, - gpuStream_t owning_stream); + gpuStream_t owning_stream, + StreamSafeCUDAAllocator *allocator); + void RecordStream(const gpuStream_t &stream); bool CanBeFreed(); - const gpuStream_t &GetOwningStream() const; private: + void RecordGraphCapturingStreams(); + void RecordStreamWithNoGraphCapturing(const gpuStream_t &stream); DecoratedAllocationPtr underlying_allocation_; + std::set graph_capturing_stream_set_; std::map outstanding_event_map_; gpuStream_t owning_stream_; SpinLock outstanding_event_map_lock_; + // To compatiable with CUDA Graph, hold the allocator shared_ptr so that + // Allocator will not deconstruct before Allocation + std::shared_ptr allocator_; }; -class StreamSafeCUDAAllocator : public Allocator { +class StreamSafeCUDAAllocator + : public Allocator, + public std::enable_shared_from_this { public: StreamSafeCUDAAllocator(std::shared_ptr underlying_allocator, - platform::CUDAPlace place, - gpuStream_t default_stream); + platform::CUDAPlace place, gpuStream_t default_stream, + bool in_cuda_graph_capturing = false); ~StreamSafeCUDAAllocator(); bool IsAllocThreadSafe() const override; @@ -63,7 +73,7 @@ class StreamSafeCUDAAllocator : public Allocator { private: void ProcessUnfreedAllocations(); - uint64_t ProcessUnfreedAllocationsWithRelease(); + uint64_t ProcessUnfreedAllocationsAndRelease(); static std::map> allocator_map_; @@ -74,6 +84,8 @@ class StreamSafeCUDAAllocator : public Allocator { gpuStream_t default_stream_; std::list unfreed_allocations_; SpinLock unfreed_allocation_lock_; + + bool in_cuda_graph_capturing_; }; } // namespace allocation diff --git a/paddle/fluid/memory/detail/buddy_allocator.cc b/paddle/fluid/memory/detail/buddy_allocator.cc index d7bbfba932cb4a5aab01bc3e2d1276dbe6450b29..076a96139612168f6c3d5d039184ccdb7a536f2e 100644 --- a/paddle/fluid/memory/detail/buddy_allocator.cc 
+++ b/paddle/fluid/memory/detail/buddy_allocator.cc @@ -26,6 +26,7 @@ DECLARE_uint64(reallocate_gpu_memory_in_mb); #endif #include "paddle/fluid/platform/device/device_wrapper.h" +#include "paddle/fluid/platform/place.h" namespace paddle { namespace memory { @@ -43,11 +44,11 @@ BuddyAllocator::BuddyAllocator( #ifdef PADDLE_WITH_CUSTOM_DEVICE if (!dev_type.empty()) { init_allocate_size_func_ = [dev_type]() { - return platform::DeviceManager::GetInitAllocSize( + return phi::DeviceManager::GetInitAllocSize( platform::PlaceHelper::CreatePlace(dev_type)); }; re_allocate_size_func_ = [dev_type]() { - return platform::DeviceManager::GetReallocSize( + return phi::DeviceManager::GetReallocSize( platform::PlaceHelper::CreatePlace(dev_type)); }; } else { diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc index a61f98c4e1a22adcc3684a9e5af190a82e3b5110..37ac0b4483291c8c3a3eeb31883c55c7eda24dc8 100644 --- a/paddle/fluid/memory/detail/system_allocator.cc +++ b/paddle/fluid/memory/detail/system_allocator.cc @@ -438,7 +438,7 @@ void* CustomAllocator::Alloc(size_t* index, size_t size) { void* p; auto place = platform::CustomPlace(dev_type_, dev_id_); - auto device = platform::DeviceManager::GetDeviceWithPlace(place); + auto device = phi::DeviceManager::GetDeviceWithPlace(place); p = device->MemoryAllocate(size); if (LIKELY(p)) { VLOG(4) << "CustomAllocator::Alloc " << p << " size " << size; @@ -447,7 +447,7 @@ void* CustomAllocator::Alloc(size_t* index, size_t size) { } else { size_t avail, total; - platform::DeviceManager::MemoryStats(place, &total, &avail); + phi::DeviceManager::MemoryStats(place, &total, &avail); PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted( "\n\nOut of memory error on %s %d. 
" "total memory is %s, used memory is %s, " @@ -470,7 +470,7 @@ void CustomAllocator::Free(void* p, size_t size, size_t index) { size, plug_alloc_size)); plug_alloc_size -= size; auto place = platform::CustomPlace(dev_type_, dev_id_); - auto device = platform::DeviceManager::GetDeviceWithPlace(place); + auto device = phi::DeviceManager::GetDeviceWithPlace(place); device->MemoryDeallocate(p, size); } diff --git a/paddle/fluid/memory/malloc.cc b/paddle/fluid/memory/malloc.cc index b60bb4fc1d1bb5e4366625277db8fdb968474891..2bca2c388a05958fda0e891190dcf7e7ddc53b0c 100644 --- a/paddle/fluid/memory/malloc.cc +++ b/paddle/fluid/memory/malloc.cc @@ -41,6 +41,11 @@ std::shared_ptr AllocShared(const platform::Place& place, stream); } +AllocationPtr Alloc(const platform::CUDAPlace& place, size_t size, + const phi::Stream& stream) { + return allocation::AllocatorFacade::Instance().Alloc(place, size, stream); +} + bool InSameStream(const std::shared_ptr& allocation, const phi::Stream& stream) { return allocation::AllocatorFacade::Instance().InSameStream(allocation, @@ -52,11 +57,6 @@ void* GetBasePtr(const std::shared_ptr& allocation) { } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -AllocationPtr Alloc(const platform::CUDAPlace& place, size_t size, - const gpuStream_t& stream) { - return allocation::AllocatorFacade::Instance().Alloc(place, size, stream); -} - uint64_t Release(const platform::CUDAPlace& place, const gpuStream_t& stream) { return allocation::AllocatorFacade::Instance().Release(place, stream); } diff --git a/paddle/fluid/memory/malloc.h b/paddle/fluid/memory/malloc.h index 89b4caa5bed26fa9b8d0bf09df702f17a310dff6..601fe3f2a42c391c602887bacccae97125b951e1 100644 --- a/paddle/fluid/memory/malloc.h +++ b/paddle/fluid/memory/malloc.h @@ -41,15 +41,15 @@ extern std::shared_ptr AllocShared(const platform::Place& place, size_t size, const phi::Stream& stream); +extern AllocationPtr Alloc(const platform::CUDAPlace& place, size_t size, + const phi::Stream& stream); + extern bool InSameStream(const std::shared_ptr& allocation, const phi::Stream& stream); extern void* GetBasePtr(const std::shared_ptr& allocation); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -extern AllocationPtr Alloc(const platform::CUDAPlace& place, size_t size, - const gpuStream_t& stream); - extern uint64_t Release(const platform::CUDAPlace& place, const gpuStream_t& stream); diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc index 166cdd0b5d6b6a523cfe470662951184ebbfabc5..3198b4f8d935e3815ba94db945a24ab4df4ca97b 100644 --- a/paddle/fluid/memory/memcpy.cc +++ b/paddle/fluid/memory/memcpy.cc @@ -44,9 +44,9 @@ void Copy( VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place << ", stream=" << stream; - platform::DeviceManager::SetDevice(src_place); - platform::stream::Stream stream_wrapper(src_place, stream); - platform::DeviceManager::GetDeviceWithPlace(src_place)->MemoryCopyD2H( + phi::DeviceManager::SetDevice(src_place); + phi::stream::Stream stream_wrapper(src_place, stream); + phi::DeviceManager::GetDeviceWithPlace(src_place)->MemoryCopyD2H( dst, src, num, &stream_wrapper); } @@ -62,9 +62,9 @@ void Copy( VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place << ", stream=" << stream; - platform::DeviceManager::SetDevice(dst_place); - platform::stream::Stream stream_wrapper(dst_place, stream); - platform::DeviceManager::GetDeviceWithPlace(dst_place)->MemoryCopyH2D( + phi::DeviceManager::SetDevice(dst_place); + 
phi::stream::Stream stream_wrapper(dst_place, stream); + phi::DeviceManager::GetDeviceWithPlace(dst_place)->MemoryCopyH2D( dst, src, num, &stream_wrapper); } @@ -82,16 +82,16 @@ void Copy( << dst_place << ", stream=" << stream; if (src_type == dst_type) { - platform::DeviceManager::SetDevice(src_place); - platform::stream::Stream stream_wrapper(src_place, stream); + phi::DeviceManager::SetDevice(src_place); + phi::stream::Stream stream_wrapper(src_place, stream); auto src_id = platform::PlaceHelper::GetDeviceId(src_place); auto dst_id = platform::PlaceHelper::GetDeviceId(dst_place); if (src_id == dst_id) { - platform::DeviceManager::GetDeviceWithPlace(src_place)->MemoryCopyD2D( + phi::DeviceManager::GetDeviceWithPlace(src_place)->MemoryCopyD2D( dst, src, num, &stream_wrapper); } else { - platform::DeviceManager::GetDeviceWithPlace(src_place)->MemoryCopyP2P( + phi::DeviceManager::GetDeviceWithPlace(src_place)->MemoryCopyP2P( dst_place, dst, src, num, &stream_wrapper); } } else { diff --git a/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu b/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu index 933717f3090c4b25f912e0bbe87922a1494c128a..5e4a4234bb41663f2287203fa9123029e6894036 100644 --- a/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu +++ b/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu @@ -12,34 +12,35 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifdef PADDLE_WITH_CUDA -#include -#include -#endif - -#ifdef PADDLE_WITH_HIP -#include -#endif - #include // NOLINT #include #include "gtest/gtest.h" #include "paddle/fluid/memory/allocation/allocator_facade.h" -#include "paddle/fluid/memory/malloc.h" -#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" +#include "paddle/fluid/memory/memory.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/phi/core/stream.h" +#ifdef PADDLE_WITH_CUDA +#include +#include +#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" +#endif + +#ifdef PADDLE_WITH_HIP +#include +#endif + namespace paddle { namespace memory { -__global__ void add_kernel(int *x, int n) { +// y += (x + 1) +__global__ void add_kernel(int *x, int *y, int n) { int thread_num = gridDim.x * blockDim.x; int thread_id = blockIdx.x * blockDim.x + threadIdx.x; for (int i = thread_id; i < n; i += thread_num) { - atomicAdd(x + i, thread_id); + y[i] += x[i] + 1; } } @@ -51,153 +52,6 @@ void CheckMemLeak(const platform::CUDAPlace &place) { << " there may be a memory leak problem"; } -class StreamSafeCUDAAllocTest : public ::testing::Test { - protected: - void SetUp() override { - place_ = platform::CUDAPlace(); - stream_num_ = 64; - grid_num_ = 1; - block_num_ = 32; - data_num_ = 131072; - workspace_size_ = data_num_ * sizeof(int); - - // alloc workspace for each stream - for (size_t i = 0; i < stream_num_; ++i) { - gpuStream_t stream; -#ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&stream)); -#else - PADDLE_ENFORCE_GPU_SUCCESS(hipStreamCreate(&stream)); -#endif - - std::shared_ptr allocation = - AllocShared(place_, workspace_size_, - phi::Stream(reinterpret_cast(stream))); -#ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_GPU_SUCCESS( - cudaMemset(allocation->ptr(), 0, allocation->size())); -#else - PADDLE_ENFORCE_GPU_SUCCESS( - hipMemset(allocation->ptr(), 0, allocation->size())); -#endif - - streams_.emplace_back(stream); - workspaces_.emplace_back(allocation); - } - - result_ = Alloc(place_, stream_num_ * 
workspace_size_); - } - - void SingleStreamRun(size_t idx) { - // for all stream i, - // stream idx lauch a kernel to add (j % thread_num) to workspaces_[i][j] - for (size_t i = 0; i < stream_num_; ++i) { - int *x = reinterpret_cast(workspaces_[i]->ptr()); - add_kernel<<>>(x, data_num_); - RecordStream(workspaces_[i], streams_[idx]); - } - } - - void CopyResultAsync() { - for (size_t i = 0; i < stream_num_; ++i) { -#ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync( - reinterpret_cast(result_->ptr()) + i * data_num_, - workspaces_[i]->ptr(), workspace_size_, cudaMemcpyDeviceToDevice)); -#else - PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpyAsync( - reinterpret_cast(result_->ptr()) + i * data_num_, - workspaces_[i]->ptr(), workspace_size_, hipMemcpyDeviceToDevice)); -#endif - } - } - - void MultiStreamRun() { - for (size_t i = 0; i < stream_num_; ++i) { - SingleStreamRun(i); - } - CopyResultAsync(); - workspaces_.clear(); // fast_gc - cudaDeviceSynchronize(); - } - - void MultiThreadMUltiStreamRun() { - std::vector threads; - for (size_t i = 0; i < stream_num_; ++i) { - threads.push_back( - std::thread(&StreamSafeCUDAAllocTest::SingleStreamRun, this, i)); - } - for (size_t i = 0; i < stream_num_; ++i) { - threads[i].join(); - } - CopyResultAsync(); - workspaces_.clear(); // fast_gc - cudaDeviceSynchronize(); - } - - void CheckResult() { - auto result_host = std::unique_ptr(new int[result_->size()]); -#ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpy(result_host.get(), result_->ptr(), - result_->size(), - cudaMemcpyDeviceToHost)); -#else - PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpy(result_host.get(), result_->ptr(), - result_->size(), - hipMemcpyDeviceToHost)); -#endif - size_t thread_num = grid_num_ * block_num_; - for (size_t i = 0; i < stream_num_; ++i) { - for (size_t j = 0; j < data_num_; ++j) { - EXPECT_TRUE(result_host[i * stream_num_ + j] == - (j % thread_num) * stream_num_); - } - } - result_.reset(); - } - - void TearDown() override { -#ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); -#else - PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); -#endif - for (gpuStream_t stream : streams_) { - Release(place_, stream); - } - - for (size_t i = 1; i < stream_num_; ++i) { -#ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(streams_[i])); -#else - PADDLE_ENFORCE_GPU_SUCCESS(hipStreamDestroy(streams_[i])); -#endif - } - - CheckMemLeak(place_); - } - - size_t stream_num_; - size_t grid_num_; - size_t block_num_; - size_t data_num_; - size_t workspace_size_; - platform::CUDAPlace place_; - std::vector streams_; - std::vector> workspaces_; - allocation::AllocationPtr result_; -}; - -TEST_F(StreamSafeCUDAAllocTest, CUDAMutilStreamTest) { - MultiStreamRun(); - CheckResult(); -} - -TEST_F(StreamSafeCUDAAllocTest, CUDAMutilThreadMutilStreamTest) { - MultiThreadMUltiStreamRun(); - CheckResult(); -} - TEST(StreamSafeCUDAAllocInterfaceTest, AllocInterfaceTest) { platform::CUDAPlace place = platform::CUDAPlace(); size_t alloc_size = 256; @@ -214,7 +68,8 @@ TEST(StreamSafeCUDAAllocInterfaceTest, AllocInterfaceTest) { paddle::platform::DeviceContextPool::Instance().Get(place)) ->stream(); allocation::AllocationPtr allocation_unique = - Alloc(place, alloc_size, default_stream); + Alloc(place, alloc_size, + phi::Stream(reinterpret_cast(default_stream))); EXPECT_GE(allocation_unique->size(), alloc_size); EXPECT_EQ(allocation_unique->ptr(), address); allocation_unique.reset(); @@ -303,36 +158,6 @@ TEST(StreamSafeCUDAAllocInterfaceTest, 
GetStreamInterfaceTest) { CheckMemLeak(place); } -#ifdef PADDLE_WITH_CUDA -TEST(StreamSafeCUDAAllocInterfaceTest, CUDAGraphExceptionTest) { - platform::CUDAPlace place = platform::CUDAPlace(); - size_t alloc_size = 1; - std::shared_ptr allocation = AllocShared(place, alloc_size); - - platform::BeginCUDAGraphCapture(place, cudaStreamCaptureModeGlobal); - EXPECT_THROW(AllocShared(place, alloc_size), paddle::platform::EnforceNotMet); - EXPECT_THROW(Alloc(place, alloc_size), paddle::platform::EnforceNotMet); - EXPECT_THROW(Release(place), paddle::platform::EnforceNotMet); - EXPECT_THROW(allocation::AllocatorFacade::Instance().GetAllocator(place), - paddle::platform::EnforceNotMet); - EXPECT_THROW( - AllocShared(place, alloc_size, - phi::Stream(reinterpret_cast(nullptr))), - paddle::platform::EnforceNotMet); - EXPECT_THROW(Alloc(place, alloc_size, nullptr), - paddle::platform::EnforceNotMet); - EXPECT_THROW(Release(place, nullptr), paddle::platform::EnforceNotMet); - EXPECT_THROW(RecordStream(allocation, nullptr), - paddle::platform::EnforceNotMet); - EXPECT_THROW(GetStream(allocation), paddle::platform::EnforceNotMet); - platform::EndCUDAGraphCapture(); - - allocation.reset(); - Release(place); - CheckMemLeak(place); -} -#endif - TEST(StreamSafeCUDAAllocRetryTest, RetryTest) { platform::CUDAPlace place = platform::CUDAPlace(); gpuStream_t stream1, stream2; @@ -348,12 +173,14 @@ TEST(StreamSafeCUDAAllocRetryTest, RetryTest) { // so the second alloc will fail and retry size_t alloc_size = available_size / 4 * 3; - allocation::AllocationPtr allocation1 = Alloc(place, alloc_size, stream1); + allocation::AllocationPtr allocation1 = Alloc( + place, alloc_size, phi::Stream(reinterpret_cast(stream1))); allocation::AllocationPtr allocation2; std::thread th([&allocation2, &place, &stream2, alloc_size]() { std::this_thread::sleep_for(std::chrono::seconds(1)); - allocation2 = Alloc(place, alloc_size, stream2); + allocation2 = Alloc(place, alloc_size, + phi::Stream(reinterpret_cast(stream2))); }); allocation1.reset(); // free but not release th.join(); @@ -371,5 +198,201 @@ TEST(StreamSafeCUDAAllocRetryTest, RetryTest) { CheckMemLeak(place); } +class StreamSafeCUDAAllocTest : public ::testing::Test { + protected: + void SetUp() override { + place_ = platform::CUDAPlace(); + stream_num_ = 64; + grid_num_ = 1; + block_num_ = 32; + data_num_ = 131072; + workspace_size_ = data_num_ * sizeof(int); + + for (size_t i = 0; i < stream_num_; ++i) { + gpuStream_t stream; +#ifdef PADDLE_WITH_CUDA + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&stream)); +#else + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamCreate(&stream)); +#endif + + std::shared_ptr workspace_allocation = + AllocShared(place_, workspace_size_, + phi::Stream(reinterpret_cast(stream))); + std::shared_ptr result_allocation = + AllocShared(place_, workspace_size_, + phi::Stream(reinterpret_cast(stream))); + std::shared_ptr host_result_allocation = + AllocShared(platform::CPUPlace(), workspace_size_); + +#ifdef PADDLE_WITH_CUDA + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemset(workspace_allocation->ptr(), 0, + workspace_allocation->size())); + PADDLE_ENFORCE_GPU_SUCCESS( + cudaMemset(result_allocation->ptr(), 0, result_allocation->size())); +#else + PADDLE_ENFORCE_GPU_SUCCESS(hipMemset(workspace_allocation->ptr(), 0, + workspace_allocation->size())); + PADDLE_ENFORCE_GPU_SUCCESS( + hipMemset(result_allocation->ptr(), 0, result_allocation->size())); +#endif + + streams_.emplace_back(stream); + workspaces_.emplace_back(workspace_allocation); + 
results_.emplace_back(result_allocation); + host_results_.emplace_back(host_result_allocation); + } + } + + void SingleStreamRun(size_t idx) { + int *y = reinterpret_cast(results_[idx]->ptr()); + int neighbouring_idx = idx > 0 ? idx - 1 : idx; + + add_kernel<<>>( + reinterpret_cast(workspaces_[idx]->ptr()), y, data_num_); + add_kernel<<>>( + reinterpret_cast(workspaces_[neighbouring_idx]->ptr()), y, + data_num_); + RecordStream(workspaces_[neighbouring_idx], streams_[idx]); + } + + void MultiStreamRun() { + // Must run in reverse order, or the workspace_[i - 1] will be released + // before streams_[i]'s kernel launch + for (int i = stream_num_ - 1; i >= 0; --i) { + SingleStreamRun(i); + workspaces_[i].reset(); // fast GC + } + } + + void MultiThreadMultiStreamRun() { + std::vector threads; + for (size_t i = 0; i < stream_num_; ++i) { + threads.push_back( + std::thread(&StreamSafeCUDAAllocTest::SingleStreamRun, this, i)); + } + for (size_t i = 0; i < stream_num_; ++i) { + threads[i].join(); + } + workspaces_.clear(); + } + + void CUDAGraphRun() { + testing_cuda_graph_ = true; + platform::BeginCUDAGraphCapture(platform::CUDAPlace(), + cudaStreamCaptureModeGlobal); + + std::shared_ptr data_allocation = + AllocShared(platform::CUDAPlace(), workspace_size_); + std::shared_ptr result_allocation = + AllocShared(platform::CUDAPlace(), workspace_size_); + + int *data = static_cast(data_allocation->ptr()); + int *result = static_cast(result_allocation->ptr()); + + gpuStream_t main_stream = GetStream(data_allocation); + gpuStream_t other_stream; + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&other_stream)); + + add_kernel<<>>(data, result, + data_num_); + RecordStream(data_allocation, other_stream); + + std::unique_ptr cuda_graph = + platform::EndCUDAGraphCapture(); + + int replay_times = 10; + for (int i = 0; i < replay_times; ++i) { + cuda_graph->Replay(); + } + + std::shared_ptr host_result_allocation = + AllocShared(platform::CPUPlace(), workspace_size_); + Copy(host_result_allocation->place(), host_result_allocation->ptr(), + result_allocation->place(), result_allocation->ptr(), workspace_size_, + main_stream); + cudaStreamSynchronize(main_stream); + + int *host_result = static_cast(host_result_allocation->ptr()); + for (int i = 0; i < data_num_; ++i) { + EXPECT_EQ(host_result[i], replay_times); + } + + data_allocation.reset(); + result_allocation.reset(); + cuda_graph.release(); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(other_stream)); + } + + void CheckResult() { + for (size_t i = 0; i < stream_num_; ++i) { + Copy(host_results_[i]->place(), host_results_[i]->ptr(), + results_[i]->place(), results_[i]->ptr(), workspace_size_, + streams_[i]); + } + cudaDeviceSynchronize(); + + size_t thread_num = grid_num_ * block_num_; + for (size_t i = 0; i < stream_num_; ++i) { + int *result = static_cast(host_results_[i]->ptr()); + for (size_t j = 0; j < data_num_; ++j) { + EXPECT_EQ(result[j], 2); + } + } + } + + void TearDown() override { + workspaces_.clear(); + results_.clear(); + host_results_.clear(); + for (gpuStream_t stream : streams_) { + Release(place_, stream); + } + + for (size_t i = 0; i < stream_num_; ++i) { +#ifdef PADDLE_WITH_CUDA + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(streams_[i])); +#else + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamDestroy(streams_[i])); +#endif + } + + // Memory release for CUDA Graph memory pool is forbidden + if (!testing_cuda_graph_) { + CheckMemLeak(place_); + } + } + + bool testing_cuda_graph_{0}; + size_t stream_num_; + size_t grid_num_; + size_t block_num_; 
+ size_t data_num_; + size_t workspace_size_; + platform::CUDAPlace place_; + std::vector streams_; + std::vector> workspaces_; + std::vector> results_; + std::vector> host_results_; +}; + +TEST_F(StreamSafeCUDAAllocTest, CUDAMutilStreamTest) { + MultiStreamRun(); + CheckResult(); +} + +TEST_F(StreamSafeCUDAAllocTest, CUDAMutilThreadMutilStreamTest) { + MultiThreadMultiStreamRun(); + CheckResult(); +} + +#ifdef PADDLE_WITH_CUDA +TEST_F(StreamSafeCUDAAllocTest, CUDAGraphTest) { + MultiStreamRun(); + CUDAGraphRun(); + CheckResult(); +} +#endif + } // namespace memory } // namespace paddle diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 91a0352e1915e95378012aa398ff996cbc10f216..e77be832c0cc8975c3fc2ebb7fad577cdfe919f5 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -161,7 +161,7 @@ cc_library(common_infer_shape_functions SRCS common_infer_shape_functions.cc DEP set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows_utils lapack_function lod_tensor maxouting unpooling pooling lod_rank_table context_project -sequence_pooling segment_pooling executor device_memory_aligment generator) +sequence_pooling executor device_memory_aligment generator) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler sample_prob tree2col) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions beam_search fc matrix_inverse matrix_solve) diff --git a/paddle/fluid/operators/abs_op.cc b/paddle/fluid/operators/abs_op.cc index c28026a4bd43aac5b0c447e24a164e27233076e8..e1460629fb18a4259731c2c9de4ed8f623b5a1e4 100644 --- a/paddle/fluid/operators/abs_op.cc +++ b/paddle/fluid/operators/abs_op.cc @@ -141,8 +141,8 @@ class AbsDoubleGradOp : public framework::OperatorWithKernel { } // namespace operators } // namespace paddle -DELCARE_INFER_SHAPE_FUNCTOR(abs, AbsInferShapeFunctor, - PT_INFER_META(phi::UnchangedInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(abs, AbsInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/activation_cudnn_op.cu.cc b/paddle/fluid/operators/activation_cudnn_op.cu.cc index 0ac29e6d3ada7335cab510ef82c9f46d2da7eb05..b4a97e24cf29233776b19aa0ea7764a00435f6fc 100644 --- a/paddle/fluid/operators/activation_cudnn_op.cu.cc +++ b/paddle/fluid/operators/activation_cudnn_op.cu.cc @@ -132,7 +132,9 @@ struct CudnnReluGradFunctor : public CudnnActivationGradFunctor { explicit CudnnReluGradFunctor(const CUDADeviceContext& ctx) : CudnnActivationGradFunctor(ctx, 0.0, GPUDNN_ACTIVATION_RELU) {} - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -146,7 +148,9 @@ struct CudnnRelu6GradFunctor : public CudnnActivationGradFunctor { : CudnnActivationGradFunctor(ctx, 6.0, GPUDNN_ACTIVATION_CLIPPED_RELU) {} - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -159,7 +163,9 @@ struct CudnnSigmoidGradFunctor : public CudnnActivationGradFunctor { explicit CudnnSigmoidGradFunctor(const CUDADeviceContext& ctx) : CudnnActivationGradFunctor(ctx, 0.0, GPUDNN_ACTIVATION_SIGMOID) {} - static constexpr 
ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -172,7 +178,9 @@ struct CudnnTanhGradFunctor : public CudnnActivationGradFunctor { explicit CudnnTanhGradFunctor(const CUDADeviceContext& ctx) : CudnnActivationGradFunctor(ctx, 0.0, GPUDNN_ACTIVATION_TANH) {} - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -197,7 +205,8 @@ class CudnnActivationGradKernel public: using T = typename Functor::ELEMENT_TYPE; void Compute(const framework::ExecutionContext& context) const override { - static_assert(Functor::FwdDeps() == kDepOut, "Forward deps must be Out."); + static_assert(Functor::FwdDeps() == ActBwdOpFwdDeps::kDepOut, + "Forward deps must be Out."); const framework::Tensor *X, *Out, *dOut; X = Out = dOut = nullptr; diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index 73d65b7c6e7e0a5be2d680afba971d54b492c05d..66f1bcc8b68692abe588b6429b027462eaebde24 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -34,7 +34,8 @@ using paddle::framework::Tensor; template static constexpr bool CanInplaceAct() { - return GradFunctor::FwdDeps() == kDepOut || GradFunctor::FwdDeps() == kNoDeps; + return GradFunctor::FwdDeps() == ActBwdOpFwdDeps::kDepOut || + GradFunctor::FwdDeps() == ActBwdOpFwdDeps::kNoDeps; } #define REGISTER_ACTIVATION_OP_MAKER(OP_NAME, OP_COMMENT) \ @@ -921,7 +922,8 @@ class ActivationOpDoubleGrad : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - if (static_cast(kDepValue) & static_cast(kDepX)) { + if (static_cast(kDepValue) & + static_cast(ActBwdOpFwdDeps::kDepX)) { if (ctx->HasOutput("DX")) { ctx->ShareDim("X", "DX"); ctx->ShareLoD("X", "DX"); @@ -931,7 +933,8 @@ class ActivationOpDoubleGrad : public framework::OperatorWithKernel { ctx->ShareLoD("X", "DDOut"); } } - if (static_cast(kDepValue) & static_cast(kDepOut)) { + if (static_cast(kDepValue) & + static_cast(ActBwdOpFwdDeps::kDepOut)) { if (ctx->HasOutput("DOut")) { ctx->ShareDim("Out", "DOut"); ctx->ShareLoD("Out", "DOut"); @@ -960,13 +963,15 @@ class ActivationOpDoubleGrad2 : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - if (static_cast(kDepValue) & static_cast(kDepX)) { + if (static_cast(kDepValue) & + static_cast(ActBwdOpFwdDeps::kDepX)) { if (ctx->HasOutput("DDOut")) { ctx->ShareDim("X", "DDOut"); ctx->ShareLoD("X", "DDOut"); } } - if (static_cast(kDepValue) & static_cast(kDepOut)) { + if (static_cast(kDepValue) & + static_cast(ActBwdOpFwdDeps::kDepOut)) { if (ctx->HasOutput("DDOut")) { ctx->ShareDim("Out", "DDOut"); ctx->ShareLoD("Out", "DDOut"); @@ -987,7 +992,8 @@ class ActivationOpTripleGrad : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - if (static_cast(kDepValue) & static_cast(kDepX)) { + if (static_cast(kDepValue) & + static_cast(ActBwdOpFwdDeps::kDepX)) { if (ctx->HasOutput("DX")) { ctx->ShareDim("X", "DX"); ctx->ShareLoD("X", "DX"); @@ -997,7 +1003,8 @@ class ActivationOpTripleGrad : public framework::OperatorWithKernel { ctx->ShareLoD("X", "DDOut"); } } - if 
(static_cast(kDepValue) & static_cast(kDepOut)) { + if (static_cast(kDepValue) & + static_cast(ActBwdOpFwdDeps::kDepOut)) { if (ctx->HasOutput("D_DOut")) { ctx->ShareDim("Out", "D_DOut"); ctx->ShareLoD("Out", "D_DOut"); @@ -1464,6 +1471,18 @@ namespace plat = paddle::platform; FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_OP); FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_CPU_KERNEL); +REGISTER_ACTIVATION_OP(cos, Cos, CosFunctor, CosGradFunctor) +REGISTER_ACTIVATION_OP(tan, Tan, TanFunctor, TanGradFunctor); +REGISTER_ACTIVATION_OP(acos, Acos, AcosFunctor, AcosGradFunctor); +REGISTER_ACTIVATION_OP(sin, Sin, SinFunctor, SinGradFunctor); +REGISTER_ACTIVATION_OP(asin, Asin, AsinFunctor, AsinGradFunctor); +REGISTER_ACTIVATION_OP(atan, Atan, AtanFunctor, AtanGradFunctor); +REGISTER_ACTIVATION_OP(sinh, Sinh, SinhFunctor, SinhGradFunctor); +REGISTER_ACTIVATION_OP(cosh, Cosh, CoshFunctor, CoshGradFunctor); +REGISTER_ACTIVATION_OP(asinh, Asinh, AsinhFunctor, AsinhGradFunctor); +REGISTER_ACTIVATION_OP(acosh, Acosh, AcoshFunctor, AcoshGradFunctor); +REGISTER_ACTIVATION_OP(atanh, Atanh, AtanhFunctor, AtanhGradFunctor); + /* ========================== sigmoid register ============================= */ // 1. Register Sigmoid Operator @@ -1584,16 +1603,6 @@ REGISTER_OPERATOR( ops::ActivationOpDoubleGrad2::FwdDeps()>, ops::ActivationDoubleGradOpInplaceInferer); -REGISTER_ACTIVATION_CPU_KERNEL(relu, Relu, ReluCPUFunctor, ReluGradFunctor); - -REGISTER_OP_CPU_KERNEL( - relu_grad_grad, - ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel>); /* ========================================================================== */ /* ======================== leaky relu register ============================ */ diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index ff41da86f7bb6ba8406d58804888b5dcd8bc3be0..4b79397b6cdf2e5c2993f7a72f512cc924c208e7 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -35,16 +35,14 @@ limitations under the License. */ #include "paddle/fluid/platform/mkldnn_helper.h" #endif +#include "paddle/phi/kernels/funcs/activation_functor.h" + namespace paddle { namespace operators { using framework::To32BitIndex; -enum ActBwdOpFwdDeps { - kNoDeps = 0x00, // Do not need any forward input/output - kDepX = 0x01, // Only need forward input X - kDepOut = 0x02, // Only need forward output Out -}; +using ActBwdOpFwdDeps = phi::funcs::ActBwdOpFwdDeps; /* The following operator can be used to process SelectedRows, because the * output of those operator for zero is zero too. 
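Editor's note on the pattern in the hunks above and below: the patch swaps the old unscoped kDepX/kDepOut/kNoDeps constants for the scoped phi::funcs::ActBwdOpFwdDeps enum, which is why every dependency check now casts both operands with static_cast<int>. The following is a standalone sketch of that pattern only, not part of the patch; the local ActBwdOpFwdDeps copy and HasDep helper are stand-ins assumed to mirror the real enum's values.

#include <iostream>

// Stand-in for phi::funcs::ActBwdOpFwdDeps (hypothetical local copy for this sketch).
enum class ActBwdOpFwdDeps : int {
  kNoDeps = 0x00,  // needs no forward tensor
  kDepX = 0x01,    // needs forward input X
  kDepOut = 0x02,  // needs forward output Out
};

// A scoped enum has no implicit conversion to int, so each bitmask test is
// written with explicit casts, matching the InferShape checks in this file.
constexpr bool HasDep(ActBwdOpFwdDeps value, ActBwdOpFwdDeps flag) {
  return (static_cast<int>(value) & static_cast<int>(flag)) != 0;
}

int main() {
  constexpr ActBwdOpFwdDeps deps = ActBwdOpFwdDeps::kDepOut;
  std::cout << std::boolalpha
            << HasDep(deps, ActBwdOpFwdDeps::kDepX) << " "    // false
            << HasDep(deps, ActBwdOpFwdDeps::kDepOut) << "\n";  // true
  return 0;
}

The scoped enum prevents the flags from silently converting to int, so the explicit casts make every dependency test visible at the call site; that is the trade-off the repeated static_cast edits in this file accept.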
@@ -89,7 +87,8 @@ inline void ExtractActivationGradTensor( auto x_grad_var = context.OutputVar(framework::GradVarName("X")); const framework::Variable* out_var = nullptr; - if (static_cast(kDepValue) & static_cast(kDepOut)) { + if (static_cast(kDepValue) & + static_cast(ActBwdOpFwdDeps::kDepOut)) { out_var = context.InputVar("Out"); PADDLE_ENFORCE_NOT_NULL( out_var, platform::errors::NotFound( @@ -139,7 +138,7 @@ inline void ExtractActivationGradTensor( "Output(Out), variable name = %s", context.OutputName(framework::GradVarName("X")))); - if (static_cast(kDepValue) & static_cast(kDepX)) { + if (static_cast(kDepValue) & static_cast(ActBwdOpFwdDeps::kDepX)) { auto x_var = context.InputVar("X"); PADDLE_ENFORCE_NOT_NULL(x_var, platform::errors::NotFound( "Cannot get the tensor from the " @@ -248,6 +247,24 @@ struct SigmoidFunctor : public BaseActivationFunctor { } }; +#define USE_PHI_FUNCTOR(name) \ + template \ + using name##Functor = phi::funcs::name##Functor; \ + template \ + using name##GradFunctor = phi::funcs::name##GradFunctor; + +USE_PHI_FUNCTOR(Cos) +USE_PHI_FUNCTOR(Tan) +USE_PHI_FUNCTOR(Acos) +USE_PHI_FUNCTOR(Sin) +USE_PHI_FUNCTOR(Asin) +USE_PHI_FUNCTOR(Atan) +USE_PHI_FUNCTOR(Sinh) +USE_PHI_FUNCTOR(Cosh) +USE_PHI_FUNCTOR(Asinh) +USE_PHI_FUNCTOR(Acosh) +USE_PHI_FUNCTOR(Atanh) + template struct SigmoidGradFunctor : public BaseActivationFunctor { template { dx.device(d) = dout * out * (static_cast(1) - out); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; /* @@ -293,7 +312,9 @@ struct SigmoidGradGradFunctor : public BaseActivationFunctor { ddout.device(*d) = (static_cast(1) - out) * out * ddx; } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; /* @@ -351,7 +372,9 @@ struct SigmoidTripleGradFunctor : public BaseActivationFunctor { (static_cast(1) - static_cast(2) * out) * dout * d_dOutNew; } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; // silu(x) = x / (1 + exp(-x)) @@ -376,7 +399,7 @@ struct SiluGradFunctor : public BaseActivationFunctor { (static_cast(1) + (temp2 / temp1))); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // Originally: logsigmoid(x) = -log (1 + exp(-x)) @@ -414,7 +437,7 @@ struct LogSigmoidGradFunctor : public BaseActivationFunctor { dout * ((-x - temp).exp() / ((-temp).exp() + (-x - temp).exp())); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // exp(x) = e^x @@ -434,7 +457,9 @@ struct ExpGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * out; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; // expm1(x) = e^x - 1 @@ -454,38 +479,23 @@ struct Expm1GradFunctor : public BaseActivationFunctor { dx.device(d) = dout * out + dout; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; // relu(x) = max(x, 0) -template -struct ReluCPUFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - 
out.device(d) = x.unaryExpr([] HOSTDEVICE(T v) { - return v > static_cast(0) ? v : static_cast(0); - }); - } -}; template -struct ReluCUDAFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.cwiseMax(static_cast(0)); - } -}; +using ReluCPUFunctor = phi::funcs::ReluCPUFunctor; +template +using ReluGradFunctor = phi::funcs::ReluGradFunctor; template -struct ReluGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout * (out > static_cast(0)).template cast(); - } +using ReluGradGradFunctor = phi::funcs::ReluGradGradFunctor; - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } -}; +template +using ReluCUDAFunctor = phi::funcs::ReluCUDAFunctor; // tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x)) template @@ -504,7 +514,9 @@ struct TanhGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * (static_cast(1) - out * out); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -534,7 +546,9 @@ struct TanhGradGradFunctor : public BaseActivationFunctor { ddout.device(*d) = (static_cast(1) - out * out) * ddx; } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; /* Out @@ -589,7 +603,9 @@ struct TanhTripleGradFunctor : public BaseActivationFunctor { static_cast(2) * out * dout * d_dOutNew; } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; // tanhshrink(x) = x - tanh(x) @@ -610,7 +626,7 @@ struct TanhShrinkGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * (x.tanh() * x.tanh()); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // tanhshrink(x) = x - tanh(x) @@ -646,7 +662,7 @@ struct HardShrinkGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * (temp1 || temp2).template cast(); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // softshrink(x) = x - lambda, if x > lambda; x + lambda, if x < -lambda; 0 @@ -682,7 +698,7 @@ struct SoftShrinkGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * (temp1 + temp2).template cast(); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // sqrt(x) = x^(1/2) @@ -702,7 +718,9 @@ struct SqrtGradFunctor : public BaseActivationFunctor { dx.device(d) = static_cast(0.5) * dout / out; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; // rsqrt(x) = x^(-1/2) @@ -722,7 +740,9 @@ struct RsqrtGradFunctor : public BaseActivationFunctor { dx.device(d) = static_cast(-0.5) * dout * out * out * out; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; // ceil(x) = ceiling(x) @@ -742,7 +762,9 @@ struct ZeroGradFunctor : public BaseActivationFunctor { dx.device(d) = static_cast(0) * out; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kNoDeps; 
} + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kNoDeps; + } }; // floor(x) = flooring(x) @@ -754,373 +776,6 @@ struct FloorFunctor : public BaseActivationFunctor { } }; -template -struct Sine { - HOSTDEVICE T operator()(const T& val) const { return sin(val); } -}; - -template <> -struct Sine { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(sin(static_cast(val))); - } -}; - -template -struct Cosine { - HOSTDEVICE T operator()(const T& val) const { return cos(val); } -}; - -template <> -struct Cosine { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(cos(static_cast(val))); - } -}; - -// cosine'(x) = -sin(x) -template -struct CosGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = -dout * x.unaryExpr(Sine()); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -// cosine(x) = cos(x) -template -struct CosFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Cosine()); - } -}; - -// sine'(x) = cos(x) -template -struct SinGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout * x.unaryExpr(Cosine()); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -// sine(x) = sin(x) -template -struct SinFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Sine()); - } -}; - -template -struct Tangent { - HOSTDEVICE T operator()(const T& val) const { return tan(val); } -}; - -template <> -struct Tangent { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(tan(static_cast(val))); - } -}; - -// Tangent'(x) = -Tangent(x) -template -struct TanGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout / x.unaryExpr(Cosine()).square(); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -// Tangent(x) = tan(x) -template -struct TanFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Tangent()); - } -}; - -template -struct Sinh { - HOSTDEVICE T operator()(const T& val) const { return sinh(val); } -}; - -template <> -struct Sinh { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(sinhf(static_cast(val))); - } -}; - -template -struct Cosh { - HOSTDEVICE T operator()(const T& val) const { return cosh(val); } -}; - -template <> -struct Cosh { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(coshf(static_cast(val))); - } -}; - -// sinh(x) = sinh(x) -template -struct SinhFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Sinh()); - } -}; - -// cosh(x) = cosh(x) -template -struct CoshFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Cosh()); - } -}; - -// sinh'(x) = cosh(x) -template -struct SinhGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out 
out, dOut dout, dX dx) const { - dx.device(d) = dout * x.unaryExpr(Cosh()); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -// cosh'(x) = sinh(x) -template -struct CoshGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout * x.unaryExpr(Sinh()); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct Acos { - HOSTDEVICE T operator()(const T& val) const { return acos(val); } -}; - -template <> -struct Acos { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(acos(static_cast(val))); - } -}; - -// Acos(x) = acos(x) -template -struct AcosFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Acos()); - } -}; - -// acos'(x) = -1/sqrt(1-x^2) -template -struct AcosGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = - -dout * static_cast(1) / (static_cast(1) - x.square()).sqrt(); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct Asin { - HOSTDEVICE T operator()(const T& val) const { return asin(val); } -}; - -template <> -struct Asin { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(asin(static_cast(val))); - } -}; - -// Asin(x) = asin(x) -template -struct AsinFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Asin()); - } -}; - -// asin'(x) = 1/sqrt(1-x^2) -template -struct AsinGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = - dout * static_cast(1) / (static_cast(1) - x.square()).sqrt(); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct Atan { - HOSTDEVICE T operator()(const T& val) const { return atan(val); } -}; - -template <> -struct Atan { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(atan(static_cast(val))); - } -}; - -// Atan(x) = atan(x) -template -struct AtanFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Atan()); - } -}; - -// atan'(x) = 1 / (1 + x^2) -template -struct AtanGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout * static_cast(1) / (static_cast(1) + x.square()); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct Acosh { - HOSTDEVICE T operator()(const T& val) const { return acosh(val); } -}; - -template <> -struct Acosh { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(acosh(static_cast(val))); - } -}; - -// Acosh(x) = acosh(x) -template -struct AcoshFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Acosh()); - } -}; - -// acosh'(x) = 1/sqrt(x^2 - 1) -template -struct AcoshGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = - dout * static_cast(1) / (x * x - static_cast(1)).sqrt(); - } - - 
static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct Asinh { - HOSTDEVICE T operator()(const T& val) const { return asinh(val); } -}; - -template <> -struct Asinh { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(asinh(static_cast(val))); - } -}; - -// Asinh(x) = asinh(x) -template -struct AsinhFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Asinh()); - } -}; - -// asinh'(x) = 1/sqrt(x^2 + 1) -template -struct AsinhGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = - dout * static_cast(1) / (x.square() + static_cast(1)).sqrt(); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct Atanh { - HOSTDEVICE T operator()(const T& val) const { return atanh(val); } -}; - -template <> -struct Atanh { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(atanh(static_cast(val))); - } -}; - -// Atanh(x) = atanh(x) -template -struct AtanhFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Atanh()); - } -}; - -// atanh'(x) = 1/(1 - x^2) -template -struct AtanhGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout * static_cast(1) / (static_cast(1) - x.square()); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - // round(x) = [x] template struct RoundFunctor : public BaseActivationFunctor { @@ -1147,7 +802,9 @@ struct ReciprocalGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * static_cast(-1) * out * out; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; // log(x) = natural logarithm of x @@ -1167,7 +824,7 @@ struct LogGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * (static_cast(1) / x); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // log2(x) = logarithm to the base 2 of the elements of x @@ -1188,7 +845,7 @@ struct Log2GradFunctor : public BaseActivationFunctor { dx.device(d) = dout * static_cast(1) / (x * static_cast(log(2))); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // log10(x) = logarithm to the base 10 of the elements of x @@ -1209,7 +866,7 @@ struct Log10GradFunctor : public BaseActivationFunctor { dx.device(d) = dout * static_cast(1) / (x * static_cast(log(10))); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // log1p(x) = natural logarithm of x+1 @@ -1229,7 +886,7 @@ struct Log1pGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * (static_cast(1) / (x + static_cast(1))); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // square(x) = x^2 @@ -1249,7 +906,7 @@ struct SquareGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * static_cast(2) * x; } - static constexpr ActBwdOpFwdDeps 
FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1285,7 +942,7 @@ struct BReluGradFunctor : public BaseActivationFunctor { .template cast(); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // relu6(x) = min(max(0, x), 6) @@ -1319,7 +976,9 @@ struct Relu6GradFunctor : public BaseActivationFunctor { .template cast(); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; // HardSwish = min(max(0, x+3), 6) * x / 6 @@ -1364,7 +1023,7 @@ struct HardSwishGradFunctor : public BaseActivationFunctor { static_cast(1) * (static_cast(1) - tmp)); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // For numerical stability, using the following formula instead of softplus(x) = @@ -1409,7 +1068,7 @@ struct SoftplusGradFunctor : public BaseActivationFunctor { .select(dout, dout / (static_cast(1) + (-x_beta).exp())); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // mish(x) = x * tanh(softplus(x)) @@ -1449,7 +1108,7 @@ struct MishGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * (tsp + x * (static_cast(1) - tsp * tsp) * gsp); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // softsign(x) = x / (1 + |x|) @@ -1472,7 +1131,7 @@ struct SoftsignGradFunctor : public BaseActivationFunctor { dout * (static_cast(1) / (static_cast(1) + x.abs()).square()); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1504,7 +1163,9 @@ struct SoftReluGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * (static_cast(1) - (-out).exp()) * temp; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -1539,7 +1200,7 @@ struct LeakyReluGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * (temp1 + temp2).template cast(); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1573,7 +1234,7 @@ struct ELUGradFunctor : public BaseActivationFunctor { .select(dout, dout * (out + static_cast(alpha))); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1592,7 +1253,7 @@ struct ELUGradNegativeAlphaFunctor : public BaseActivationFunctor { .select(dout, dout * static_cast(alpha) * x.exp()); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1672,7 +1333,7 @@ struct CELUGradFunctor : public BaseActivationFunctor { dout * (x / static_cast(alpha)).exp() * temp_a_neg * temp_x_neg; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5198 @@ -1701,7 
+1362,7 @@ struct PowGradFunctor : public BaseActivationFunctor { x.pow(static_cast(factor) - static_cast(1)); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1766,7 +1427,7 @@ struct STanhGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * a * b * (static_cast(1) - temp); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1797,7 +1458,7 @@ struct ThresholdedReluGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * (x > th).template cast(); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1832,7 +1493,9 @@ struct HardSigmoidGradFunctor : public BaseActivationFunctor { static_cast(slope); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -1865,7 +1528,7 @@ struct SwishGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * ((static_cast(beta) * out) + temp2); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; /* @@ -1902,7 +1565,7 @@ inline void ExtractActivationDoubleGradTensor( "Cannot get the tensor from the Variable Output, variable name = %s", ctx.OutputName("DDX"))); - if (static_cast(kDepValue) & static_cast(kDepX)) { + if (static_cast(kDepValue) & static_cast(ActBwdOpFwdDeps::kDepX)) { auto x_var = ctx.InputVar("X"); PADDLE_ENFORCE_NOT_NULL( x_var, platform::errors::NotFound( @@ -1925,7 +1588,8 @@ inline void ExtractActivationDoubleGradTensor( VLOG(10) << "Inplace activation of Op: " << ctx.Type(); *X = *ddX; } - if (static_cast(kDepValue) & static_cast(kDepOut)) { + if (static_cast(kDepValue) & + static_cast(ActBwdOpFwdDeps::kDepOut)) { auto out_var = ctx.InputVar("Out"); PADDLE_ENFORCE_NOT_NULL( out_var, @@ -2000,28 +1664,7 @@ struct AbsGradGradFunctor : public BaseActivationFunctor { ddout.device(*d) = ddx * x.sign(); } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct ReluGradGradFunctor : public BaseActivationFunctor { - template - void operator()(const Device& dev, const framework::Tensor* X, - const framework::Tensor* Out, const framework::Tensor* ddX, - framework::Tensor* ddOut, framework::Tensor* dOut, - framework::Tensor* dX) const { - auto* d = dev.eigen_device(); - auto ddx = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddX, "Input", "DDX", "ReluGradGrad")); - auto out = framework::EigenVector::Flatten( - GET_DATA_SAFELY(Out, "Output", "Out", "ReluGradGrad")); - if (ddOut) { - auto ddout = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddOut, "Output", "DDOut", "ReluGradGrad")); - ddout.device(*d) = ddx * (out > static_cast(0)).template cast(); - } - } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -2050,7 +1693,7 @@ struct LeakyReluGradGradFunctor : public BaseActivationFunctor { .template cast(); } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -2088,7 +1731,7 @@ struct ELUGradGradFunctor : public BaseActivationFunctor { 
.template cast(); } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -2127,7 +1770,7 @@ struct CELUGradGradFunctor : public BaseActivationFunctor { .template cast(); } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -2156,7 +1799,9 @@ struct SqrtGradGradFunctor : public BaseActivationFunctor { ddout.device(*d) = ddx * static_cast(0.5) / out; } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -2185,7 +1830,9 @@ struct RsqrtGradGradFunctor : public BaseActivationFunctor { ddout.device(*d) = ddx * static_cast(-0.5) * out * out * out; } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -2214,7 +1861,7 @@ struct SquareGradGradFunctor : public BaseActivationFunctor { ddout.device(*d) = ddx * static_cast(2) * x; } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // TODO(dengkaipeng): double gradient calculation for Square/Sqrt need @@ -2840,7 +2487,7 @@ struct LogGradGradFunctor : public BaseActivationFunctor { } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; } // namespace operators @@ -2849,20 +2496,9 @@ struct LogGradGradFunctor : public BaseActivationFunctor { #define FOR_EACH_ACTIVATION_OP(__macro) \ __macro(silu, Silu, SiluFunctor, SiluGradFunctor); \ __macro(logsigmoid, LogSigmoid, LogSigmoidFunctor, LogSigmoidGradFunctor); \ - __macro(atan, Atan, AtanFunctor, AtanGradFunctor); \ __macro(softshrink, SoftShrink, SoftShrinkFunctor, SoftShrinkGradFunctor); \ __macro(ceil, Ceil, CeilFunctor, ZeroGradFunctor); \ __macro(floor, Floor, FloorFunctor, ZeroGradFunctor); \ - __macro(cos, Cos, CosFunctor, CosGradFunctor); \ - __macro(tan, Tan, TanFunctor, TanGradFunctor); \ - __macro(acos, Acos, AcosFunctor, AcosGradFunctor); \ - __macro(sin, Sin, SinFunctor, SinGradFunctor); \ - __macro(asin, Asin, AsinFunctor, AsinGradFunctor); \ - __macro(sinh, Sinh, SinhFunctor, SinhGradFunctor); \ - __macro(cosh, Cosh, CoshFunctor, CoshGradFunctor); \ - __macro(asinh, Asinh, AsinhFunctor, AsinhGradFunctor); \ - __macro(acosh, Acosh, AcoshFunctor, AcoshGradFunctor); \ - __macro(atanh, Atanh, AtanhFunctor, AtanhGradFunctor); \ __macro(round, Round, RoundFunctor, ZeroGradFunctor); \ __macro(reciprocal, Reciprocal, ReciprocalFunctor, ReciprocalGradFunctor); \ __macro(log1p, Log1p, Log1pFunctor, Log1pGradFunctor); \ diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.kps similarity index 78% rename from paddle/fluid/operators/activation_op.cu rename to paddle/fluid/operators/activation_op.kps index e578ad899e74b7afb6b966d2afa5695be1e6c5c9..92a101451e211f912e5390171654affa3be4e973 100644 --- a/paddle/fluid/operators/activation_op.cu +++ b/paddle/fluid/operators/activation_op.kps @@ -18,28 +18,6 @@ limitations under the License. 
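Across activation_op.h above, every bare `kDepX` / `kDepOut` / `kNoDeps` return is rewritten as `ActBwdOpFwdDeps::kDepX` and so on, consistent with `ActBwdOpFwdDeps` having become a scoped enum shared with phi. A minimal sketch of the resulting pattern, assuming an enum definition of roughly this shape (the enum body is an assumption for illustration; the real definition lives in phi):

// Assumed shape of the forward-dependency flags used by the grad functors.
enum class ActBwdOpFwdDeps : int {
  kNoDeps = 0x00,  // backward needs neither forward X nor forward Out
  kDepX = 0x01,    // backward needs the forward input X
  kDepOut = 0x02,  // backward needs the forward output Out
};

template <typename T>
struct ExampleGradFunctor {
  // With a scoped enum the enumerator must be qualified at every use site:
  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};

Bitmask-style tests keep working through explicit casts, as in the updated `ExtractActivationDoubleGradTensor` above: `static_cast<int>(kDepValue) & static_cast<int>(ActBwdOpFwdDeps::kDepX)`.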
*/ namespace paddle { namespace operators { -template -struct CudaReluFunctor : public BaseActivationFunctor { - T zero = static_cast(0.0f); - - // relu(x) = max(x, 0) - __device__ __forceinline__ T operator()(const T x) const { - return x > zero ? x : zero; - } -}; - -template -struct CudaReluGradFunctor : public BaseActivationFunctor { - T zero = static_cast(0.0f); - - // dx = dout * (out > 0) - __device__ __forceinline__ T operator()(const T dout, const T out) const { - return out > zero ? dout : zero; - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } -}; - template struct CudaLeakyReluFunctor : public BaseActivationFunctor { T zero = static_cast(0.0f); @@ -69,7 +47,7 @@ struct CudaLeakyReluGradFunctor : public BaseActivationFunctor { return x > zero ? dout : static_cast(alpha) * dout; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -93,7 +71,9 @@ struct CudaSigmoidGradFunctor : public BaseActivationFunctor { return dout * out * (one - out); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -122,7 +102,7 @@ struct CudaSiluGradFunctor : public BaseActivationFunctor { return static_cast(dout * (temp * (one + x * (one - temp)))); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -159,30 +139,7 @@ struct CudaLogSigmoidGradFunctor : public BaseActivationFunctor { return static_cast(dout * (temp2 / (exp(-temp1) + temp2))); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct CudaAtanFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // atan(x) = atan(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(atan(x)); - } -}; - -template -struct CudaAtanGradFunctor : public BaseActivationFunctor { - T one = static_cast(1.0f); - - // dx = dout / (1 + x^2) - __device__ __forceinline__ T operator()(const T dout, const T x) const { - return dout / (one + x * x); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -219,7 +176,7 @@ struct CudaSoftShrinkGradFunctor : public BaseActivationFunctor { return (x >= -l && x <= l) ? 
zero : dout; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -262,191 +219,9 @@ struct CudaZeroGradFunctor : public BaseActivationFunctor { return static_cast(0.0f); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kNoDeps; } -}; - -template -struct CudaCosFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // cos(x) = cos(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(cos(x)); - } -}; - -template -struct CudaCosGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // dx = dout * (-sin(x)) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(-dout * sin(x)); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct CudaSinFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // sin(x) = sin(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(sin(x)); - } -}; - -template -struct CudaSinGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // dx = dout * cos(x) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(dout * cos(x)); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct CudaTanFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // tan(x) = tan(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(tan(x)); - } -}; - -template -struct CudaTanGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // dx = dout / cos(x)^2 - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(dout / (cos(x) * cos(x))); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct CudaAsinFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // asin(x) = asin(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(asin(x)); - } -}; - -template -struct CudaAsinGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType one = static_cast(1.0f); - - // dx = dout / sqrt(1 - x^2) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(dout / sqrt(one - x * x)); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct CudaAcosFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // acos(x) = acos(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(acos(x)); - } -}; - -template -struct CudaAcosGradFunctor : 
public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType one = static_cast(1.0f); - - // dx = -dout / sqrt(1 - x^2) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(-dout / sqrt(one - x * x)); + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kNoDeps; } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct CudaCoshFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // cosh(x) = cosh(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(cosh(x)); - } -}; - -template -struct CudaCoshGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // dx = dout * sinh(x) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(dout * sinh(x)); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct CudaSinhFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // sinh(x) = sinh(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(sinh(x)); - } -}; - -template -struct CudaSinhGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // dx = dout * cosh(x) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(dout * cosh(x)); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } }; template @@ -469,88 +244,11 @@ struct CudaTanhGradFunctor : public BaseActivationFunctor { return dout * (one - out * out); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } -}; - -template -struct CudaAcoshFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // Acosh(x) = acosh(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(acosh(x)); - } -}; - -template -struct CudaAcoshGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType one = static_cast(1.0f); - // dx = dout * 1 / sqrt(x^2 - 1) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(dout * one / sqrt(x * x - one)); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct CudaAsinhFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // Asinh(x) = asinh(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(asinh(x)); + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; } }; -template -struct CudaAsinhGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType one = static_cast(1.0f); - - // dx = dout * 1/sqrt(x^2 + 1) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = 
static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(dout * one / sqrt(x * x + one)); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct CudaAtanhFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // Atanh(x) = atanh(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(atanh(x)); - } -}; - -template -struct CudaAtanhGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType one = static_cast(1.0f); - // dx = dout * 1/(1- x^2) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(dout * one / (one - x * x)); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - template struct CudaReciprocalFunctor : public BaseActivationFunctor { T one = static_cast(1.0f); @@ -566,7 +264,9 @@ struct CudaReciprocalGradFunctor : public BaseActivationFunctor { return -dout * out * out; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -587,7 +287,9 @@ struct CudaExpGradFunctor : public BaseActivationFunctor { return dout * out; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -608,7 +310,9 @@ struct CudaExpm1GradFunctor : public BaseActivationFunctor { return dout * out + dout; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -629,7 +333,7 @@ struct CudaLogGradFunctor : public BaseActivationFunctor { return dout / x; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -647,7 +351,7 @@ struct CudaSquareGradFunctor : public BaseActivationFunctor { return dout * two * x; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -670,7 +374,9 @@ struct CudaSqrtGradFunctor : public BaseActivationFunctor { return one_half * dout / out; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -693,7 +399,9 @@ struct CudaRsqrtGradFunctor : public BaseActivationFunctor { return minus_one_half * dout * out * out * out; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -717,7 +425,7 @@ struct CudaLog1pGradFunctor : public BaseActivationFunctor { return dout / (one + x); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -741,7 +449,7 @@ struct CudaLog2GradFunctor : public BaseActivationFunctor { return dout / (x * log_two); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -765,7 +473,7 @@ struct CudaLog10GradFunctor : public BaseActivationFunctor { return 
dout / (x * log_ten); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -804,7 +512,7 @@ struct CudaBReluGradFunctor : public BaseActivationFunctor { return (x > t_min_cast && x < t_max_cast) ? dout : zero; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -849,7 +557,9 @@ struct CudaSoftReluGradFunctor : public BaseActivationFunctor { : static_cast(0.0f); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -893,7 +603,7 @@ struct CudaSTanhGradFunctor : public BaseActivationFunctor { return static_cast(dout * a * b * (one - temp * temp)); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -939,7 +649,7 @@ struct CudaSoftplusGradFunctor : public BaseActivationFunctor { return x_beta > t ? arg_dout : static_cast(dout / (one + exp(-x_beta))); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -962,7 +672,7 @@ struct CudaSoftsignGradFunctor : public BaseActivationFunctor { return dout / (temp * temp); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -996,7 +706,9 @@ struct CudaRelu6GradFunctor : public BaseActivationFunctor { return (out > zero && out < t) ? dout : zero; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -1022,7 +734,7 @@ struct CudaTanhShrinkGradFunctor : public BaseActivationFunctor { return static_cast(dout * tanh(x) * tanh(x)); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1056,7 +768,7 @@ struct CudaHardShrinkGradFunctor : public BaseActivationFunctor { return (x > -t && x < t) ? zero : dout; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1097,7 +809,9 @@ struct CudaHardSigmoidGradFunctor : public BaseActivationFunctor { return (out > zero && out < one) ? dout * static_cast(slope) : zero; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -1141,7 +855,7 @@ struct CudaSwishGradFunctor : public BaseActivationFunctor { return static_cast(dout * (temp2 + temp3)); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1190,7 +904,7 @@ struct CudaMishGradFunctor : public BaseActivationFunctor { return static_cast(dout * (tsp + x * (one - tsp * tsp) * gsp)); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1222,7 +936,7 @@ struct CudaThresholdedReluGradFunctor : public BaseActivationFunctor { return x > static_cast(threshold) ? 
dout : zero; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1274,7 +988,7 @@ struct CudaHardSwishGradFunctor : public BaseActivationFunctor { return dout * (temp1 * temp2 * (two * x + o) / s + one - temp2); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1320,7 +1034,9 @@ struct CudaELUGradFunctor : public BaseActivationFunctor { return static_cast(dout * (out_pos + out_neg * (out + a))); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -1347,7 +1063,7 @@ struct CudaELUGradNegativeAlphaFunctor : public BaseActivationFunctor { return static_cast(dout * (x_pos + x_neg * (out + a))); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1429,7 +1145,7 @@ struct CudaCELUGradFunctor : public BaseActivationFunctor { temp_a_neg * temp_x_pos + exp(x / a) * temp_a_neg * temp_x_neg)); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1477,13 +1193,14 @@ class ActivationGradCudaKernel std::vector ins = {d_out}; std::vector outs = {d_x}; - if (static_cast(Functor::FwdDeps()) == static_cast(kDepOut)) { + if (static_cast(Functor::FwdDeps()) == + static_cast(ActBwdOpFwdDeps::kDepOut)) { // Only need forward output Out ins.push_back(out); paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, &outs, functor); } else if (static_cast(Functor::FwdDeps()) == - static_cast(kDepX)) { + static_cast(ActBwdOpFwdDeps::kDepX)) { // Only need forward input X ins.push_back(x); paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, @@ -1509,7 +1226,9 @@ namespace plat = paddle::platform; ops::ActivationCudaKernel>, \ ops::ActivationCudaKernel>); \ + ops::functor>, \ + ops::ActivationCudaKernel>); \ REGISTER_OP_CUDA_KERNEL( \ act_type##_grad, \ ops::ActivationGradCudaKernel>, \ ops::ActivationGradCudaKernel>); + ops::grad_functor>, \ + ops::ActivationGradCudaKernel>); #define REGISTER_ACTIVATION_CUDA_KERNEL_INT(act_type, op_name, functor, \ grad_functor) \ @@ -1531,7 +1252,9 @@ namespace plat = paddle::platform; ops::ActivationCudaKernel>, \ ops::ActivationCudaKernel>); \ + ops::functor>, \ + ops::ActivationCudaKernel>); \ REGISTER_OP_CUDA_KERNEL( \ act_type##_grad, \ ops::ActivationGradCudaKernel>, \ ops::ActivationGradCudaKernel>); + ops::grad_functor>, \ + ops::ActivationGradCudaKernel>); /* ======================== leaky relu register ============================ */ REGISTER_ACTIVATION_CUDA_KERNEL(leaky_relu, LeakyRelu, CudaLeakyReluFunctor, @@ -1594,50 +1319,6 @@ REGISTER_OP_CUDA_KERNEL( ops::CELUGradGradFunctor>); /* ========================================================================== */ -/* =========================== relu register ============================ */ -#ifdef PADDLE_WITH_HIP -REGISTER_ACTIVATION_CUDA_KERNEL(relu, Relu, CudaReluFunctor, - CudaReluGradFunctor); -REGISTER_OP_CUDA_KERNEL( - relu_grad_grad, - ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel>); -#else -REGISTER_OP_CUDA_KERNEL( - relu, ops::ActivationCudaKernel>, - ops::ActivationCudaKernel>, - 
ops::ActivationCudaKernel>, - ops::ActivationCudaKernel>); -REGISTER_OP_CUDA_KERNEL( - relu_grad, ops::ActivationGradCudaKernel>, - ops::ActivationGradCudaKernel>, - ops::ActivationGradCudaKernel>, - ops::ActivationGradCudaKernel>); -REGISTER_OP_CUDA_KERNEL( - relu_grad_grad, - ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel>); -#endif -/* ========================================================================== */ - /* =========================== sigmoid register ============================ */ REGISTER_ACTIVATION_CUDA_KERNEL(sigmoid, Sigmoid, CudaSigmoidFunctor, @@ -1650,7 +1331,9 @@ REGISTER_OP_CUDA_KERNEL( ops::SigmoidDoubleGradKernel>, ops::SigmoidDoubleGradKernel>); + ops::SigmoidGradGradFunctor>, + ops::SigmoidDoubleGradKernel>); REGISTER_OP_CUDA_KERNEL( sigmoid_triple_grad, @@ -1659,7 +1342,10 @@ REGISTER_OP_CUDA_KERNEL( ops::SigmoidTripleGradKernel>, ops::SigmoidTripleGradKernel>); + ops::SigmoidTripleGradFunctor>, + ops::SigmoidTripleGradKernel< + plat::CUDADeviceContext, + ops::SigmoidTripleGradFunctor>); /* ========================================================================== */ /* =========================== tanh register ============================ */ @@ -1696,7 +1382,9 @@ REGISTER_OP_CUDA_KERNEL( ops::SqrtDoubleGradKernel>, ops::SqrtDoubleGradKernel>); + ops::SqrtGradGradFunctor>, + ops::SqrtDoubleGradKernel>); /* ========================================================================== */ /* =========================== rsqrt register ============================= @@ -1726,6 +1414,8 @@ REGISTER_OP_CUDA_KERNEL( ops::SquareGradGradFunctor>, ops::SquareDoubleGradKernel>, + ops::SquareDoubleGradKernel>, ops::SquareDoubleGradKernel>, ops::SquareDoubleGradKernel>); \ + REGISTER_OP_KERNEL(act_type##_grad, KP, plat::XPUPlace, \ + ops::ActivationGradCudaKernel>); + +REGISTER_ACTIVATION_XPU_KERNEL(leaky_relu, LeakyRelu, CudaLeakyReluFunctor, + CudaLeakyReluGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(sigmoid, Sigmoid, CudaSigmoidFunctor, + CudaSigmoidGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(exp, Exp, CudaExpFunctor, CudaExpGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(log, Log, CudaLogFunctor, CudaLogGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(reciprocal, Reciprocal, CudaReciprocalFunctor, + CudaReciprocalGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(softplus, Softplus, CudaSoftplusFunctor, + CudaSoftplusGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(hard_swish, HardSwish, CudaHardSwishFunctor, + CudaHardSwishGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(elu, Elu, CudaELUFunctor, CudaELUGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(celu, Celu, CudaCELUFunctor, + CudaCELUGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(sqrt, Sqrt, CudaSqrtFunctor, + CudaSqrtGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(square, Square, CudaSquareFunctor, + CudaSquareGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(silu, Silu, CudaSiluFunctor, + CudaSiluGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(logsigmoid, LogSigmoid, CudaLogSigmoidFunctor, + CudaLogSigmoidGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(softshrink, SoftShrink, CudaSoftShrinkFunctor, + CudaSoftShrinkGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(ceil, Ceil, CudaCeilFunctor, + CudaZeroGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(floor, Floor, CudaFloorFunctor, + CudaZeroGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(log1p, Log1p, CudaLog1pFunctor, + CudaLog1pGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(brelu, BRelu, CudaBReluFunctor, + 
CudaBReluGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(soft_relu, SoftRelu, CudaSoftReluFunctor, + CudaSoftReluGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(softsign, Softsign, CudaSoftsignFunctor, + CudaSoftsignGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(relu6, Relu6, CudaRelu6Functor, + CudaRelu6GradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(hard_shrink, HardShrink, CudaHardShrinkFunctor, + CudaHardShrinkGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(hard_sigmoid, HardSigmoid, + CudaHardSigmoidFunctor, + CudaHardSigmoidGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(swish, Swish, CudaSwishFunctor, + CudaSwishGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(thresholded_relu, ThresholdedRelu, + CudaThresholdedReluFunctor, + CudaThresholdedReluGradFunctor); + +#endif // PADDLE_WITH_XPU_KP diff --git a/paddle/fluid/operators/addmm_op.cc b/paddle/fluid/operators/addmm_op.cc index de4d7818020dd586547ff9eedb53108285048c09..716a2e40179e404c2afcec31fb72cde7172f7e54 100644 --- a/paddle/fluid/operators/addmm_op.cc +++ b/paddle/fluid/operators/addmm_op.cc @@ -147,8 +147,8 @@ class AddMMOpGradMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(addmm, AddmmInferShapeFunctor, - PT_INFER_META(phi::AddmmInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(addmm, AddmmInferShapeFunctor, + PD_INFER_META(phi::AddmmInferMeta)); REGISTER_OPERATOR(addmm, ops::AddMMOp, ops::AddMMOpMaker, ops::AddMMOpGradMaker, ops::AddMMOpGradMaker, diff --git a/paddle/fluid/operators/amp/fp16_type_traits.h b/paddle/fluid/operators/amp/fp16_type_traits.h index f7aa0de97598df67817d81c1d1c1a5e8356f42ea..56aebe90788fbaa6c300ee9ac620c3d7613ff141 100644 --- a/paddle/fluid/operators/amp/fp16_type_traits.h +++ b/paddle/fluid/operators/amp/fp16_type_traits.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once +#include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/float16.h" namespace paddle { @@ -32,6 +33,12 @@ class MPTypeTrait { using Type = float; }; +template <> +class MPTypeTrait { + public: + using Type = float; +}; + } // namespace details } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/angle_op.h b/paddle/fluid/operators/angle_op.h index db5a3ea2961948a241c0424c77eecb0d77183e48..116a8053db3edb724d2c68b93d92ce958fbe8e32 100644 --- a/paddle/fluid/operators/angle_op.h +++ b/paddle/fluid/operators/angle_op.h @@ -36,8 +36,8 @@ class AngleKernel : public framework::OpKernel { auto numel = x->numel(); auto* x_data = x->data(); - auto* out_data = out->mutable_data>( - context.GetPlace(), size_t(x->numel() * sizeof(phi::funcs::Real))); + auto* out_data = out->mutable_data>( + context.GetPlace(), size_t(x->numel() * sizeof(phi::dtype::Real))); auto& dev_ctx = context.template device_context(); platform::ForRange for_range(dev_ctx, numel); @@ -57,7 +57,7 @@ class AngleGradKernel : public framework::OpKernel { ctx.Output(framework::GradVarName("X")); auto numel = d_out->numel(); - auto* dout_data = d_out->data>(); + auto* dout_data = d_out->data>(); auto* x_data = x->data(); auto* dx_data = d_x->mutable_data( ctx.GetPlace(), static_cast(numel * sizeof(T))); diff --git a/paddle/fluid/operators/arg_max_op.cc b/paddle/fluid/operators/arg_max_op.cc index 0f5c048b6be9c73ae98181685269592f409196cd..c5e4188ca2d6f749a06127c41da99490a7fb3ffc 100644 --- a/paddle/fluid/operators/arg_max_op.cc +++ b/paddle/fluid/operators/arg_max_op.cc @@ -15,23 +15,19 @@ limitations under the License. 
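The operators below (arg_max here, then arg_min and argsort) follow the same migration already visible in addmm_op.cc above: the hand-written `InferShape` override and the per-type `REGISTER_OP_CPU_KERNEL` block are dropped, and shape/dtype inference is delegated to a phi `InferMeta` function through `DECLARE_INFER_SHAPE_FUNCTOR` + `PD_INFER_META` (the addmm hunk also fixes the earlier `DELCARE_INFER_SHAPE_FUNCTOR` / `PT_INFER_META` spellings). A hedged sketch of the wiring for a hypothetical operator — `my_op`, `MyOp`, and `phi::MyInferMeta` are placeholder names, not identifiers from this diff:

// Illustrative only: bind phi::MyInferMeta as my_op's shape inference.
DECLARE_INFER_SHAPE_FUNCTOR(my_op, MyOpInferShapeFunctor,
                            PD_INFER_META(phi::MyInferMeta));

REGISTER_OPERATOR(
    my_op, ops::MyOp, ops::MyOpMaker,
    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
    MyOpInferShapeFunctor);  // the functor is appended as the last argument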
*/ #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/arg_min_max_op_base.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + +DECLARE_INFER_SHAPE_FUNCTOR(arg_max, ArgMaxInferShapeFunctor, + PD_INFER_META(phi::ArgMinMaxInferMeta)); + REGISTER_OPERATOR( arg_max, paddle::operators::ArgMinMaxOp, paddle::operators::ArgMaxOpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); - -REGISTER_OP_CPU_KERNEL( - arg_max, - paddle::operators::ArgMaxKernel, - paddle::operators::ArgMaxKernel, - paddle::operators::ArgMaxKernel, - paddle::operators::ArgMaxKernel, - paddle::operators::ArgMaxKernel, - paddle::operators::ArgMaxKernel); + paddle::framework::EmptyGradOpMaker, + ArgMaxInferShapeFunctor); + REGISTER_OP_VERSION(arg_max) .AddCheckpoint( R"ROC( diff --git a/paddle/fluid/operators/arg_min_max_op_base.cu.h b/paddle/fluid/operators/arg_min_max_op_base.cu.h deleted file mode 100644 index b77031f7fb4c9d94f30ed06333b9c8766fd2310d..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/arg_min_max_op_base.cu.h +++ /dev/null @@ -1,202 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#if defined(__NVCC__) || defined(__HIPCC__) - -#ifdef __NVCC__ -#include "cub/cub.cuh" -#endif -#ifdef __HIPCC__ -#include -namespace cub = hipcub; -#endif -#include -#include -#include -#include -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/transpose_op.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/phi/core/ddim.h" - -namespace paddle { -namespace operators { - -namespace { // NOLINT -template -using KeyValuePair = cub::KeyValuePair; -using Tensor = framework::Tensor; - -} // end namespace - -#define FIXED_BLOCK_DIM_CASE_BASE(log2_block_dim, ...) \ - case (1 << (log2_block_dim)): { \ - constexpr auto kBlockDim = (1 << (log2_block_dim)); \ - __VA_ARGS__; \ - } break - -#define FIXED_BLOCK_DIM_CASE(...) 
\ - FIXED_BLOCK_DIM_CASE_BASE(10, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_CASE_BASE(9, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_CASE_BASE(8, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_CASE_BASE(7, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_CASE_BASE(6, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_CASE_BASE(5, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_CASE_BASE(4, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_CASE_BASE(3, ##__VA_ARGS__); - -template -__global__ void ArgCUDAKernel(const int64_t height, // n * h - const int64_t width, // c - const int64_t post_size, // h - const Reducer reducer, const T init, const T* in, - IndType* out) { - typedef cub::BlockReduce, BlockDim> BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - - for (int idx = blockIdx.x; idx < height; idx += gridDim.x) { - KeyValuePair kv_pair = {-1, init}; - int h = idx / post_size; - int w = idx % post_size; - for (int k = threadIdx.x; k < width; k += blockDim.x) { - kv_pair = - reducer({k, in[h * width * post_size + k * post_size + w]}, kv_pair); - } - kv_pair = BlockReduce(temp_storage).Reduce(kv_pair, reducer); - if (threadIdx.x == 0) { - out[idx] = static_cast(kv_pair.key); - } - __syncthreads(); - } -} - -template -void ComputeFullArg(const platform::CUDADeviceContext& ctx, const Tensor& input, - Tensor* indices, const int64_t pre, const int64_t post, - const int64_t n) { - auto cu_stream = ctx.stream(); - auto ComputeBlockSize = [](int64_t col) { - auto block_size = 8; - if (col > 512) - block_size = 1024; - else if (col > 256) - block_size = 512; - else if (col > 128) - block_size = 256; - else if (col > 64) - block_size = 128; - else if (col > 32) - block_size = 64; - else if (col > 16) - block_size = 32; - else if (col > 8) - block_size = 16; -#ifdef __HIPCC__ - block_size = std::min(block_size, 256); -#endif - return block_size; - }; - - int64_t max_grid_dimx = ctx.GetCUDAMaxGridDimSize()[0]; - int64_t height = pre * post; - int64_t width = n; - int64_t grid_size = height < max_grid_dimx ? 
height : max_grid_dimx; - - const T* in_data = input.data(); - IndType* out_data = indices->mutable_data(ctx.GetPlace()); - - if (typeid(Reducer) == typeid(cub::ArgMax)) { - switch (ComputeBlockSize(width)) { - FIXED_BLOCK_DIM_CASE( - ArgCUDAKernel<<>>( - height, width, post, Reducer(), std::numeric_limits::lowest(), - in_data, out_data)); - } - } else { - switch (ComputeBlockSize(width)) { - FIXED_BLOCK_DIM_CASE( - ArgCUDAKernel<<>>( - height, width, post, Reducer(), std::numeric_limits::max(), - in_data, out_data)); - } - } -} - -template -struct VisitDataCudaArgMinMaxFunctor { - const framework::ExecutionContext& ctx; - - explicit VisitDataCudaArgMinMaxFunctor(const framework::ExecutionContext& ctx) - : ctx(ctx) {} - template - void apply() const { - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); - int axis = ctx.Attr("axis"); - const bool& flatten = ctx.Attr("flatten"); - - framework::DDim input_dims; - if (flatten) { - input_dims = phi::make_ddim({input->numel()}); - // if flatten, the axis just as 0 - axis = 0; - } else { - input_dims = input->dims(); - if (axis < 0) axis += input->dims().size(); - } - - int64_t numel = input->numel(); - int64_t groups = numel / input_dims[axis]; - int64_t pre = 1; - int64_t post = 1; - int64_t n = input_dims[axis]; - - for (int i = 0; i < axis; i++) { - pre *= input_dims[i]; - } - - for (int i = axis + 1; i < input_dims.size(); i++) { - post *= input_dims[i]; - } - - const auto& dev_ctx = ctx.cuda_device_context(); - ComputeFullArg(dev_ctx, *input, output, pre, post, n); - } -}; -template -class ArgMinMaxOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dtype = ctx.Attr("dtype"); - if (dtype < 0) { - framework::VisitDataTypeTiny( - static_cast( - framework::proto::VarType::INT64), - VisitDataCudaArgMinMaxFunctor(ctx)); - return; - } - framework::VisitDataTypeTiny( - static_cast(dtype), - VisitDataCudaArgMinMaxFunctor(ctx)); - } -}; - -#endif - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/arg_min_max_op_base.h b/paddle/fluid/operators/arg_min_max_op_base.h index d3ce61d183a3d322e40966ce59f9a10320ceab4f..585341beea12c14fbd01a3a47af34ce57def0db5 100644 --- a/paddle/fluid/operators/arg_min_max_op_base.h +++ b/paddle/fluid/operators/arg_min_max_op_base.h @@ -27,193 +27,9 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -enum ArgMinMaxType { kArgMin, kArgMax }; - -template -struct ArgMinMaxFunctor {}; - -#define DECLARE_ARG_MIN_MAX_FUNCTOR(eigen_op_type, enum_argminmax_value) \ - template \ - struct ArgMinMaxFunctor { \ - void operator()(const DeviceContext& ctx, const framework::LoDTensor& in, \ - framework::LoDTensor* out, framework::DDim x_dims, \ - int64_t axis, bool keepdims) { \ - auto in_eigen = framework::EigenTensor::From(in, x_dims); \ - if (keepdims) { \ - auto out_eigen = framework::EigenTensor::From(*out); \ - out_eigen.device(*(ctx.eigen_device())) = \ - in_eigen.eigen_op_type(axis).template cast(); \ - } else { \ - auto out_eigen = framework::EigenTensor::From(*out); \ - out_eigen.device(*(ctx.eigen_device())) = \ - in_eigen.eigen_op_type(axis).template cast(); \ - } \ - } \ - } - -DECLARE_ARG_MIN_MAX_FUNCTOR(argmin, ArgMinMaxType::kArgMin); -DECLARE_ARG_MIN_MAX_FUNCTOR(argmax, ArgMinMaxType::kArgMax); - -template -struct VisitDataArgMinMaxFunctor { - const framework::ExecutionContext& ctx; - - explicit VisitDataArgMinMaxFunctor(const framework::ExecutionContext& ctx) - : ctx(ctx) {} - template - void apply() const { - auto& x = *(ctx.Input("X")); - auto& out = *(ctx.Output("Out")); - out.template mutable_data(ctx.GetPlace()); - auto axis = ctx.Attr("axis"); - auto keepdims = ctx.Attr("keepdims"); - const bool& flatten = ctx.Attr("flatten"); - // paddle do not have the scalar tensor, just return the shape [1] tensor - if (flatten) keepdims = true; - - // if flatten, will construct the new dims for the cacluate - framework::DDim x_dims; - if (flatten) { - x_dims = phi::make_ddim({x.numel()}); - // if flatten, the axis just as 0 - axis = 0; - } else { - x_dims = x.dims(); - if (axis < 0) axis += x_dims.size(); - } - auto& dev_ctx = ctx.template device_context(); - -#define CALL_ARG_MINMAX_FUNCTOR(rank) \ - ArgMinMaxFunctor \ - functor##rank; \ - functor##rank(dev_ctx, x, &out, x_dims, axis, keepdims) - - switch (x_dims.size()) { - case 1: - CALL_ARG_MINMAX_FUNCTOR(1); - break; - case 2: - CALL_ARG_MINMAX_FUNCTOR(2); - break; - case 3: - CALL_ARG_MINMAX_FUNCTOR(3); - break; - case 4: - CALL_ARG_MINMAX_FUNCTOR(4); - break; - case 5: - CALL_ARG_MINMAX_FUNCTOR(5); - break; - case 6: - CALL_ARG_MINMAX_FUNCTOR(6); - break; - default: - PADDLE_ENFORCE_LE( - x_dims.size(), 6, - platform::errors::InvalidArgument( - "%s operator doesn't supports tensors whose ranks are greater " - "than 6.", - (EnumArgMinMaxValue == kArgMin ? 
"argmin" : "argmax"))); - break; -#undef CALL_ARG_MINMAX_FUNCTOR - } - } -}; - -template -class ArgMinMaxKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dtype = ctx.Attr("dtype"); - if (dtype < 0) { - framework::VisitDataTypeTiny( - static_cast( - framework::proto::VarType::INT64), - VisitDataArgMinMaxFunctor(ctx)); - return; - } - framework::VisitDataTypeTiny( - static_cast(dtype), - VisitDataArgMinMaxFunctor(ctx)); - } -}; - -template -using ArgMinKernel = ArgMinMaxKernel; - -template -using ArgMaxKernel = ArgMinMaxKernel; - class ArgMinMaxOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "arg_min_max"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "arg_min_max"); - const auto& x_dims = ctx->GetInputDim("X"); - int64_t axis = ctx->Attrs().Get("axis"); - bool keepdims = ctx->Attrs().Get("keepdims"); - const bool& flatten = ctx->Attrs().Get("flatten"); - - PADDLE_ENFORCE_GE(axis, -x_dims.size(), - platform::errors::InvalidArgument( - "'axis'(%d) must be greater than or equal to" - " -Rank(X)(%d).", - axis, -x_dims.size())); - PADDLE_ENFORCE_LT( - axis, x_dims.size(), - platform::errors::InvalidArgument( - "'axis'(%d) must be less than Rank(X)(%d) of Input(X).", axis, - x_dims.size())); - - const int& dtype = ctx->Attrs().Get("dtype"); - PADDLE_ENFORCE_EQ( - (dtype < 0 || dtype == 2 || dtype == 3), true, - platform::errors::InvalidArgument( - "The attribute of dtype in argmin/argmax must be [%s] or [%s], but " - "received [%s]", - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64), - paddle::framework::DataTypeToString( - static_cast(dtype)))); - - auto x_rank = x_dims.size(); - if (axis < 0) axis += x_rank; - if (ctx->IsRuntime()) { - if (dtype == framework::proto::VarType::INT32) { - int64_t all_element_num = 0; - if (flatten) { - all_element_num = phi::product(x_dims); - - } else { - all_element_num = x_dims[axis]; - } - PADDLE_ENFORCE_LE( - all_element_num, INT_MAX, - platform::errors::InvalidArgument( - "The element num of the argmin/argmax input at axis is " - "%d, is larger than int32 maximum value:%d, you must " - "set the dtype of argmin/argmax to 'int64'.", - all_element_num, INT_MAX)); - } - } - std::vector vec; - if (flatten) { - vec.emplace_back(static_cast(1)); - } else { - for (int64_t i = 0; i < axis; i++) vec.emplace_back(x_dims[i]); - if (keepdims) { - vec.emplace_back(static_cast(1)); - } - for (int64_t i = axis + 1; i < x_rank; i++) vec.emplace_back(x_dims[i]); - } - ctx->SetOutputDim("Out", phi::make_ddim(vec)); - } }; class BaseArgMinMaxOpMaker : public framework::OpProtoAndCheckerMaker { diff --git a/paddle/fluid/operators/arg_min_op.cc b/paddle/fluid/operators/arg_min_op.cc index 0a4ba6fb0bfdfccfc4eae99da730e96fe5f0a540..fb3abd01af8c396d764f9f1d247f24c41bd15959 100644 --- a/paddle/fluid/operators/arg_min_op.cc +++ b/paddle/fluid/operators/arg_min_op.cc @@ -12,26 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/arg_min_max_op_base.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + +DECLARE_INFER_SHAPE_FUNCTOR(arg_min, ArgMinInferShapeFunctor, + PD_INFER_META(phi::ArgMinMaxInferMeta)); REGISTER_OPERATOR( arg_min, paddle::operators::ArgMinMaxOp, paddle::operators::ArgMinOpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); + paddle::framework::EmptyGradOpMaker, + ArgMinInferShapeFunctor); -REGISTER_OP_CPU_KERNEL( - arg_min, - paddle::operators::ArgMinKernel, - paddle::operators::ArgMinKernel, - paddle::operators::ArgMinKernel, - paddle::operators::ArgMinKernel, - paddle::operators::ArgMinKernel, - paddle::operators::ArgMinKernel); REGISTER_OP_VERSION(arg_min) .AddCheckpoint( R"ROC( diff --git a/paddle/fluid/operators/arg_min_op.cu b/paddle/fluid/operators/arg_min_op.cu deleted file mode 100644 index 23170bf0087906d752767051ce58874cb3584ee5..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/arg_min_op.cu +++ /dev/null @@ -1,21 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/arg_min_max_op_base.cu.h" -REGISTER_OP_CUDA_KERNEL( - arg_min, paddle::operators::ArgMinMaxOpCUDAKernel, - paddle::operators::ArgMinMaxOpCUDAKernel, - paddle::operators::ArgMinMaxOpCUDAKernel, - paddle::operators::ArgMinMaxOpCUDAKernel, - paddle::operators::ArgMinMaxOpCUDAKernel); diff --git a/paddle/fluid/operators/argsort_op.cc b/paddle/fluid/operators/argsort_op.cc index 9e525c20335d37242d0e239e81d2d2976b92a6b4..1a8aca777370bc140e39b7457702557042541744 100644 --- a/paddle/fluid/operators/argsort_op.cc +++ b/paddle/fluid/operators/argsort_op.cc @@ -12,40 +12,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/argsort_op.h" #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + namespace paddle { namespace operators { class ArgsortOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "argsort"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "argsort"); - OP_INOUT_CHECK(ctx->HasOutput("Indices"), "Output", "Indices", "argsort"); - - auto in_dims = ctx->GetInputDim("X"); - int axis = ctx->Attrs().Get("axis"); - - auto num_dims = in_dims.size(); - PADDLE_ENFORCE_GE(axis, -num_dims, - platform::errors::InvalidArgument( - "'axis'(%d) must be greater than or equal to" - " -num_dims(%d).", - axis, -num_dims)); - PADDLE_ENFORCE_LT( - axis, num_dims, - platform::errors::InvalidArgument( - "'axis'(%d) must be less than num_dims(%d).", axis, num_dims)); - - ctx->ShareDim("X", "Out"); - ctx->ShareDim("X", "Indices"); - ctx->ShareLoD("X", "Out"); - ctx->ShareLoD("X", "Indices"); - } }; class ArgsortGradOp : public framework::OperatorWithKernel { @@ -122,18 +101,11 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(ArgsortGradNoNeedBufferVarsInferer, "X"); } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(argsort, ArgsortInferShapeFunctor, + PD_INFER_META(phi::ArgsortInferMeta)); REGISTER_OPERATOR(argsort, ops::ArgsortOp, ops::ArgsortOpMaker, ops::ArgsortGradOpMaker, - ops::ArgsortGradOpMaker); + ops::ArgsortGradOpMaker, + ArgsortInferShapeFunctor); REGISTER_OPERATOR(argsort_grad, ops::ArgsortGradOp, ops::ArgsortGradNoNeedBufferVarsInferer); -REGISTER_OP_CPU_KERNEL(argsort, - ops::ArgsortKernel, - ops::ArgsortKernel, - ops::ArgsortKernel, - ops::ArgsortKernel); -REGISTER_OP_CPU_KERNEL( - argsort_grad, ops::ArgsortGradientKernel, - ops::ArgsortGradientKernel, - ops::ArgsortGradientKernel, - ops::ArgsortGradientKernel); diff --git a/paddle/fluid/operators/argsort_op.cu b/paddle/fluid/operators/argsort_op.cu deleted file mode 100644 index 8b7a0b3eadb16bbe0822809748e343dc0d793a0f..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/argsort_op.cu +++ /dev/null @@ -1,430 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include -#include -#ifdef __NVCC__ -#include "cub/cub.cuh" -#endif -#ifdef __HIPCC__ -#include -namespace cub = hipcub; -#endif -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/argsort_op.h" -#include "paddle/fluid/operators/transpose_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -#ifdef __HIPCC__ -namespace rocprim { -namespace detail { -template <> -struct radix_key_codec_base - : radix_key_codec_integral {}; -} // namespace detail -} // namespace rocprim -#else -// set cub base traits in order to handle float16 -namespace cub { -template <> -struct NumericTraits - : BaseTraits {}; -} // namespace cub -#endif - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -// Iter for move to next row -struct SegmentOffsetIter { - EIGEN_DEVICE_FUNC - explicit SegmentOffsetIter(int num_cols) : num_cols_(num_cols) {} - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator()(int idx) const { - return idx * num_cols_; - } - - int num_cols_; -}; - -template -static __global__ void FillIndex(T* indices, T num_rows, T num_cols) { - int col_id = threadIdx.x; - int row_id = blockIdx.x; - - for (T j = row_id; j < num_rows; j += gridDim.x) { - for (T i = col_id; i < num_cols; i += blockDim.x) { - indices[j * num_cols + i] = i; - } - } -} - -template -static __global__ void FillFlattenGrad(const T* dO, const IndType* indices, - int64_t size, T* dX) { - int index = threadIdx.x + blockIdx.x * blockDim.x; - int stride = blockDim.x * gridDim.x; - for (int i = index; i < size; i += stride) { - dX[indices[i]] = dO[i]; - } -} - -template -static __global__ void FillGrad(const T* dO, const IndType* indices, T* dX, - IndType num_rows, IndType num_cols) { - int col_id = threadIdx.x; - int row_id = blockIdx.x; - - for (IndType j = row_id; j < num_rows; j += gridDim.x) { - for (IndType i = col_id; i < num_cols; i += blockDim.x) { - dX[j * num_cols + indices[j * num_cols + i]] = dO[j * num_cols + i]; - } - } -} - -// Sort by flag descending, True: descending. False: Ascending. -// Default is false. -template -void ArgFullSort(const platform::CUDADeviceContext& ctx, const Tensor* input, - Tensor* output, Tensor* indices, const IndType num_rows, - const IndType num_cols, const bool descending) { - auto cu_stream = ctx.stream(); - - Tensor input_indices; - - const std::vector dims = {num_rows, num_cols}; - auto dim = phi::make_ddim(dims); - input_indices.Resize(dim); - input_indices.mutable_data(ctx.GetPlace()); - - size_t temp_storage_bytes = -1; - - auto ComputeBlockSize = [](IndType col) { - if (col > 512) - return 1024; - else if (col > 256 && col <= 512) - return 512; - else if (col > 128 && col <= 256) - return 256; - else if (col > 64 && col <= 128) - return 128; - else - return 64; - }; - - int block_size = ComputeBlockSize(num_cols); - - int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0]; - // actually, int num_rows < max_grid_size - int grid_size = num_rows < maxGridDimX ? 
num_rows : maxGridDimX; - // Init a index array - FillIndex<<>>( - input_indices.data(), num_rows, num_cols); - - T* sorted_out_ptr; - IndType* sorted_indices_ptr; - - const T* inp = input->data(); - T* out = output->mutable_data(ctx.GetPlace()); - IndType* ind = indices->mutable_data(ctx.GetPlace()); - - sorted_out_ptr = out; - sorted_indices_ptr = ind; - - // create iter for counting input - cub::CountingInputIterator counting_iter(0); - // segment_offset is used for move to next row - cub::TransformInputIterator> - segment_offsets_t(counting_iter, SegmentOffsetIter(num_cols)); - - gpuError_t err; - if (descending) { - err = cub::DeviceSegmentedRadixSort::SortPairsDescending( - nullptr, temp_storage_bytes, inp, sorted_out_ptr, - input_indices.data(), sorted_indices_ptr, num_cols * num_rows, - num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8, - cu_stream); - } else { - err = cub::DeviceSegmentedRadixSort::SortPairs( - nullptr, temp_storage_bytes, inp, sorted_out_ptr, - input_indices.data(), sorted_indices_ptr, num_cols * num_rows, - num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8, - cu_stream); - } - PADDLE_ENFORCE_GPU_SUCCESS(err); - - Tensor temp_storage; - temp_storage.mutable_data(ctx.GetPlace(), temp_storage_bytes); - - if (descending) { - err = cub::DeviceSegmentedRadixSort::SortPairsDescending( - temp_storage.data(), temp_storage_bytes, inp, sorted_out_ptr, - input_indices.data(), sorted_indices_ptr, num_cols * num_rows, - num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8, - cu_stream); - } else { - err = cub::DeviceSegmentedRadixSort::SortPairs( - temp_storage.data(), temp_storage_bytes, inp, sorted_out_ptr, - input_indices.data(), sorted_indices_ptr, num_cols * num_rows, - num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8, - cu_stream); - } - - PADDLE_ENFORCE_GPU_SUCCESS(err); -} - -template -void ArgFullAssign(const platform::CUDADeviceContext& ctx, const Tensor* dO, - const Tensor* indices, Tensor* dX, const IndType num_rows, - const IndType num_cols) { - auto cu_stream = ctx.stream(); - - auto ComputeBlockSize = [](IndType col) { - if (col > 512) - return 1024; - else if (col > 256 && col <= 512) - return 512; - else if (col > 128 && col <= 256) - return 256; - else if (col > 64 && col <= 128) - return 128; - else - return 64; - }; - - int block_size = ComputeBlockSize(num_cols); - - int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0]; - // actually, int num_rows < max_grid_size - int grid_size = num_rows < maxGridDimX ? 
num_rows : maxGridDimX; - FillGrad<<>>( - dO->data(), indices->data(), dX->data(), num_rows, - num_cols); -} - -template -void ArgFlattenAssign(const platform::CUDADeviceContext& ctx, const Tensor* dO, - const Tensor* indices, int64_t size, Tensor* dX) { - auto cu_stream = ctx.stream(); - - const int64_t block_size = - std::min(size, static_cast(ctx.GetMaxThreadsPerBlock())); - int64_t max_threads = ctx.GetMaxPhysicalThreadCount(); - const int64_t max_blocks = - std::max(((max_threads - 1) / block_size + 1), static_cast(1)); - const int64_t grid_size = - std::min(max_blocks, (size + block_size - 1) / block_size); - - FillFlattenGrad<<>>( - dO->data(), indices->data(), size, dX->data()); -} - -template -class ArgsortOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); - auto* indices = ctx.Output("Indices"); - int axis = ctx.Attr("axis"); - bool descending = ctx.Attr("descending"); - - auto in_dims = input->dims(); - axis = (axis < 0) ? (in_dims.size() + axis) : axis; - - const T* in_data = input->data(); - auto size = input->numel(); - T* out_data = output->mutable_data(ctx.GetPlace()); - int64_t* ids_data = indices->mutable_data(ctx.GetPlace()); - - // Use thrust for parallel acceleration when the input size is equal to the - // length of the ‘axis’ dimension. - // Compared to the following 'Special case for full sort', ascending sort is - // 34 times faster and descending sort is 31 times faster. - if (size == in_dims[axis]) { - thrust::sequence(thrust::device, ids_data, ids_data + size); - thrust::copy(thrust::device, in_data, in_data + size, out_data); - thrust::sort_by_key(thrust::device, out_data, out_data + size, ids_data); - if (descending) { - thrust::reverse(thrust::device, out_data, out_data + size); - thrust::reverse(thrust::device, ids_data, ids_data + size); - } - return; - } - - // Special case for full sort, speedup ~190x. 
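    // ("Full sort" means the sort axis is already the innermost one, so the
    //  tensor is viewed as a 2-D [input_height, input_width] matrix, where
    //  input_height is the product of all leading dims and input_width is the
    //  last dim, and every row is sorted as one independent cub segment. Any
    //  other axis is handled in the else-branch below by first swapping it
    //  with the last axis: for shape [2, 3, 4] and axis = 1 the permutation
    //  trans = {0, 2, 1} gives a [2, 4, 3] view whose last axis is the one to
    //  sort, and because such a swap is its own inverse the same trans is
    //  applied again afterwards to restore the original layout.)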
- if (axis == -1 || axis + 1 == in_dims.size()) { - const int64_t input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t input_width = in_dims[in_dims.size() - 1]; - const auto& dev_ctx = ctx.cuda_device_context(); - ArgFullSort(dev_ctx, input, output, indices, input_height, - input_width, descending); - } else { - // if not full sort, do transpose first - std::vector trans; - for (int i = 0; i < axis; i++) { - trans.push_back(i); - } - trans.push_back(in_dims.size() - 1); - for (int i = axis + 1; i < in_dims.size() - 1; i++) { - trans.push_back(i); - } - trans.push_back(axis); - framework::DDim trans_dims(in_dims); - for (int i = 0; i < trans.size(); i++) { - trans_dims[i] = in_dims[trans[i]]; - } - - Tensor trans_inp; - T* trans_inp_data = trans_inp.mutable_data(trans_dims, ctx.GetPlace()); - int ndims = trans.size(); - const auto& dev_ctx = ctx.cuda_device_context(); - // Do transpose - TransCompute(ndims, dev_ctx, *input, - &trans_inp, trans); - - const int64_t input_height = - phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); - const int64_t input_width = trans_dims[trans_dims.size() - 1]; - - Tensor tmp_out; - tmp_out.mutable_data(trans_dims, ctx.GetPlace()); - T* out_data = output->mutable_data(ctx.GetPlace()); - - Tensor tmp_indices; - // temp indices for sorting - tmp_indices.mutable_data(trans_dims, ctx.GetPlace()); - indices->mutable_data(ctx.GetPlace()); - - ArgFullSort(dev_ctx, &trans_inp, &tmp_out, &tmp_indices, - input_height, input_width, descending); - - TransCompute( - ndims, dev_ctx, tmp_indices, indices, trans); - // transpose back - TransCompute(ndims, dev_ctx, tmp_out, - output, trans); - return; - } - } -}; - -template -class ArgsortGradOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* indices = ctx.Input("Indices"); - auto* dX = ctx.Output(framework::GradVarName("X")); - auto* dO = ctx.Input(framework::GradVarName("Out")); - int axis = ctx.Attr("axis"); - - dX->mutable_data(ctx.GetPlace()); - if (dO->numel() == 0) return; - - auto in_dims = dX->dims(); - axis = (axis < 0) ? (in_dims.size() + axis) : axis; - - int64_t size = dX->numel(); - const auto& dev_ctx = ctx.cuda_device_context(); - - // Parallel acceleration when the input size is equal to the length of the - // ‘axis’ dimension. - // Compared to 'special case for full sort' below, the gradient calculation - // is 10 times faster. - if (size == in_dims[axis]) { - ArgFlattenAssign(dev_ctx, dO, indices, size, dX); - return; - } - - // Special case for full sort, speedup ~190x. 
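    // (The backward pass is a scatter through the saved indices: since
    //  Out[j, i] = X[j, Indices[j, i]] along the sorted axis, the kernels
    //  above write dX[j, Indices[j, i]] = dOut[j, i] (FillGrad), or
    //  dX[Indices[i]] = dOut[i] in the fully flattened case (FillFlattenGrad).
    //  Each row of Indices is a permutation, so every element of dX is written
    //  exactly once and no atomics are required.)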
- if (axis == -1 || axis + 1 == in_dims.size()) { - const int64_t input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t input_width = in_dims[in_dims.size() - 1]; - ArgFullAssign(dev_ctx, dO, indices, dX, input_height, - input_width); - } else { - // if not full sort, do transpose first - std::vector trans; - for (int i = 0; i < axis; i++) { - trans.push_back(i); - } - trans.push_back(in_dims.size() - 1); - for (int i = axis + 1; i < in_dims.size() - 1; i++) { - trans.push_back(i); - } - trans.push_back(axis); - framework::DDim trans_dims(in_dims); - for (int i = 0; i < trans.size(); i++) { - trans_dims[i] = in_dims[trans[i]]; - } - - Tensor trans_dO; - trans_dO.mutable_data(trans_dims, ctx.GetPlace()); - Tensor trans_ind; - trans_ind.mutable_data(trans_dims, ctx.GetPlace()); - int ndims = trans.size(); - // Do transpose - TransCompute(ndims, dev_ctx, *dO, - &trans_dO, trans); - TransCompute( - ndims, dev_ctx, *indices, &trans_ind, trans); - - const int64_t input_height = - phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); - const int64_t input_width = trans_dims[trans_dims.size() - 1]; - - Tensor tmp_out; - tmp_out.mutable_data(trans_dims, ctx.GetPlace()); - - ArgFullAssign(dev_ctx, &trans_dO, &trans_ind, &tmp_out, - input_height, input_width); - - // transpose back - TransCompute(ndims, dev_ctx, tmp_out, dX, - trans); - return; - } - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OP_CUDA_KERNEL( - argsort, - paddle::operators::ArgsortOpCUDAKernel, - paddle::operators::ArgsortOpCUDAKernel, - paddle::operators::ArgsortOpCUDAKernel, - paddle::operators::ArgsortOpCUDAKernel, - paddle::operators::ArgsortOpCUDAKernel); -REGISTER_OP_CUDA_KERNEL( - argsort_grad, paddle::operators::ArgsortGradOpCUDAKernel, - paddle::operators::ArgsortGradOpCUDAKernel, - paddle::operators::ArgsortGradOpCUDAKernel, - paddle::operators::ArgsortGradOpCUDAKernel, - paddle::operators::ArgsortGradOpCUDAKernel); diff --git a/paddle/fluid/operators/argsort_op.h b/paddle/fluid/operators/argsort_op.h deleted file mode 100644 index d850e51a4bf061d3e5fc46bd53a2ef56610d6de9..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/argsort_op.h +++ /dev/null @@ -1,243 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
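One detail of the deleted ArgFullSort above that is easy to misread: the two nearly identical cub::DeviceSegmentedRadixSort calls are cub's standard two-phase convention, not a duplicated sort. The first call, made with a null workspace pointer, only reports the required temporary-storage size; the sort itself happens in the second call. Condensed to the essentials (placeholder variable names, ascending case only):

// Phase 1: d_temp_storage == nullptr, so cub only fills in temp_bytes.
size_t temp_bytes = 0;
cub::DeviceSegmentedRadixSort::SortPairs(
    nullptr, temp_bytes, keys_in, keys_out, idx_in, idx_out,
    num_rows * num_cols, num_rows, seg_offsets, seg_offsets + 1,
    /*begin_bit=*/0, /*end_bit=*/sizeof(T) * 8, stream);
// ... allocate temp_bytes bytes of device memory as d_temp_storage ...
// Phase 2: same arguments plus the real workspace; this performs the sort.
cub::DeviceSegmentedRadixSort::SortPairs(
    d_temp_storage, temp_bytes, keys_in, keys_out, idx_in, idx_out,
    num_rows * num_cols, num_rows, seg_offsets, seg_offsets + 1,
    /*begin_bit=*/0, /*end_bit=*/sizeof(T) * 8, stream);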
*/ - -#pragma once -#include -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/transpose_op.h" - -namespace paddle { -namespace operators { - -template -using EigenMatrix = framework::EigenMatrix; - -template -using EigenVector = framework::EigenVector; - -using Tensor = framework::Tensor; - -template -static void FullSort(Type input_height, Type input_width, int input_dim, - const framework::Tensor* input, T* t_out, Type* t_indices, - bool descending) { -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (Type i = 0; i < input_height; ++i) { - std::vector> col_vec; - col_vec.reserve(input_width); - if (input_dim == 1) { - auto e_input = EigenVector::Flatten(*input); - for (Type j = 0; j < input_width; ++j) { - col_vec.push_back(std::pair(e_input(j), j)); - } - } else { - auto e_input = EigenMatrix::Reshape(*input, input_dim - 1); - for (Type j = 0; j < input_width; ++j) { - col_vec.push_back(std::pair(e_input(i, j), j)); - } - } - std::sort(col_vec.begin(), col_vec.end(), - [&](const std::pair& l, const std::pair& r) { - if (descending) - return l.first > r.first; - else - return l.first < r.first; - }); - - for (Type j = 0; j < input_width; ++j) { - t_out[i * input_width + j] = col_vec[j].first; - t_indices[i * input_width + j] = col_vec[j].second; - } - } -} - -template -static void FullAssign(Type input_height, Type input_width, int input_dim, - const framework::Tensor* input, - const framework::Tensor* indices, T* t_out) { -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (Type i = 0; i < input_height; ++i) { - if (input_dim == 1) { - auto e_input = EigenVector::Flatten(*input); - auto e_indices = EigenVector::Flatten(*indices); - for (Type j = 0; j < input_width; ++j) { - t_out[i * input_width + e_indices(j)] = e_input(j); - } - } else { - auto e_input = EigenMatrix::Reshape(*input, input_dim - 1); - auto e_indices = EigenMatrix::Reshape(*indices, input_dim - 1); - for (Type j = 0; j < input_width; ++j) { - t_out[i * input_width + e_indices(i, j)] = e_input(i, j); - } - } - } -} - -template -class ArgsortKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); - auto* indices = ctx.Output("Indices"); - int axis = ctx.Attr("axis"); - bool descending = ctx.Attr("descending"); - - auto in_dims = input->dims(); - axis = (axis < 0) ? 
(in_dims.size() + axis) : axis; - - T* out_data = output->mutable_data(ctx.GetPlace()); - - // Do full sort - if (axis == -1 || axis + 1 == in_dims.size()) { - const int64_t input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t input_width = in_dims[in_dims.size() - 1]; - - int64_t* ids_data = indices->mutable_data(ctx.GetPlace()); - FullSort(input_height, input_width, in_dims.size(), input, - out_data, ids_data, descending); - } else { - // If not full sort do transpose - std::vector trans; - for (int i = 0; i < axis; i++) { - trans.push_back(i); - } - trans.push_back(in_dims.size() - 1); - for (int i = axis + 1; i < in_dims.size() - 1; i++) { - trans.push_back(i); - } - trans.push_back(axis); - framework::DDim trans_dims(in_dims); - for (size_t i = 0; i < trans.size(); i++) { - trans_dims[i] = in_dims[trans[i]]; - } - - Tensor trans_inp; - trans_inp.mutable_data(trans_dims, ctx.GetPlace()); - int ndims = trans.size(); - auto& dev_ctx = ctx.template device_context(); - // Do transpose - TransCompute(ndims, dev_ctx, *input, - &trans_inp, trans); - - const int64_t input_height = - phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); - const int64_t input_width = trans_dims[trans_dims.size() - 1]; - - Tensor tmp_out; - T* t_out = tmp_out.mutable_data(trans_dims, ctx.GetPlace()); - output->mutable_data(ctx.GetPlace()); - - Tensor tmp_indices; - - auto* t_ind = - tmp_indices.mutable_data(trans_dims, ctx.GetPlace()); - - FullSort(input_height, input_width, in_dims.size(), - &trans_inp, t_out, t_ind, descending); - - indices->mutable_data(ctx.GetPlace()); - TransCompute( - ndims, dev_ctx, tmp_indices, indices, trans); - // transpose back - TransCompute(ndims, dev_ctx, tmp_out, - output, trans); - } - } -}; - -template -class ArgsortGradientKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* indices = ctx.Input("Indices"); - auto* dX = ctx.Output(framework::GradVarName("X")); - auto* dO = ctx.Input(framework::GradVarName("Out")); - int axis = ctx.Attr("axis"); - - auto in_dims = indices->dims(); - axis = (axis < 0) ? 
(in_dims.size() + axis) : axis; - - dX->mutable_data(ctx.GetPlace()); - auto dxt = framework::EigenVector::Flatten(*dX); - auto& place = *ctx.template device_context() - .eigen_device(); - dxt.device(place) = dxt.constant(static_cast(0)); - if (dO->numel() == 0) return; - - // Do full assign - if (axis == -1 || axis + 1 == in_dims.size()) { - const int64_t input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t input_width = in_dims[in_dims.size() - 1]; - - FullAssign(input_height, input_width, in_dims.size(), dO, - indices, dX->data()); - } else { - // If not full assign do transpose - std::vector trans; - for (int i = 0; i < axis; i++) { - trans.push_back(i); - } - trans.push_back(in_dims.size() - 1); - for (int i = axis + 1; i < in_dims.size() - 1; i++) { - trans.push_back(i); - } - trans.push_back(axis); - framework::DDim trans_dims(in_dims); - for (size_t i = 0; i < trans.size(); i++) { - trans_dims[i] = in_dims[trans[i]]; - } - - Tensor trans_dO; - trans_dO.mutable_data(trans_dims, ctx.GetPlace()); - Tensor trans_ind; - trans_ind.mutable_data(trans_dims, ctx.GetPlace()); - int ndims = trans.size(); - auto& dev_ctx = ctx.template device_context(); - // Do transpose - TransCompute(ndims, dev_ctx, *dO, - &trans_dO, trans); - TransCompute( - ndims, dev_ctx, *indices, &trans_ind, trans); - - const int64_t input_height = - phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); - const int64_t input_width = trans_dims[trans_dims.size() - 1]; - - Tensor tmp_out; - T* t_out = tmp_out.mutable_data(trans_dims, ctx.GetPlace()); - - FullAssign(input_height, input_width, in_dims.size(), - &trans_dO, &trans_ind, t_out); - - // transpose back - TransCompute(ndims, dev_ctx, tmp_out, dX, - trans); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/argsort_op_npu.cc b/paddle/fluid/operators/argsort_op_npu.cc index 077be715bece0b4119dc0a578a1cba4631eb45f2..c927eec00bc8bf9e84ad1fb53a907ff8ec71acbc 100644 --- a/paddle/fluid/operators/argsort_op_npu.cc +++ b/paddle/fluid/operators/argsort_op_npu.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/argsort_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/argsort_op_xpu.cc b/paddle/fluid/operators/argsort_op_xpu.cc index 18e81936a16c63a1d2693dfb47dc618c3e707ae0..359b00fcf87ee1bee27e668ae3973fa39be19d76 100644 --- a/paddle/fluid/operators/argsort_op_xpu.cc +++ b/paddle/fluid/operators/argsort_op_xpu.cc @@ -14,7 +14,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/argsort_op.h" +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/assign_op_npu_test.cc b/paddle/fluid/operators/assign_op_npu_test.cc index 72488a932d9c33cbfeddc9f35818e42ebe0137fa..b452dea8536dd98d6d4060d5224e39daf9137c50 100644 --- a/paddle/fluid/operators/assign_op_npu_test.cc +++ b/paddle/fluid/operators/assign_op_npu_test.cc @@ -23,7 +23,6 @@ limitations under the License. 
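For the CPU path removed with argsort_op.h above (which is also why argsort_op_npu.cc and argsort_op_xpu.cc now include op_registry.h directly), the heart of FullSort is a per-row sort of (value, index) pairs. Stripped of the Eigen plumbing, it amounts to the following standalone sketch, not the deleted code verbatim:

#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

// Distilled form of the deleted FullSort: sort one row of `width` values and
// record the original positions, honoring `descending`.
template <typename T>
void SortRowWithIndices(const T* row, int64_t width, bool descending,
                        T* sorted_vals, int64_t* sorted_idx) {
  std::vector<std::pair<T, int64_t>> col_vec;
  col_vec.reserve(width);
  for (int64_t j = 0; j < width; ++j) col_vec.emplace_back(row[j], j);
  std::sort(col_vec.begin(), col_vec.end(),
            [descending](const std::pair<T, int64_t>& l,
                         const std::pair<T, int64_t>& r) {
              return descending ? l.first > r.first : l.first < r.first;
            });
  for (int64_t j = 0; j < width; ++j) {
    sorted_vals[j] = col_vec[j].first;
    sorted_idx[j] = col_vec[j].second;
  }
}

The removed ArgsortKernel applied this row by row over the [input_height, input_width] view and, for a non-innermost axis, wrapped it in the same transpose-sort-transpose pattern as the CUDA kernel.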
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/atan2_op.cc b/paddle/fluid/operators/atan2_op.cc index 71a895c244c54f62c0af1745635c08fea35436c4..0783b30a8580db403255211d879d9400a1e82ab7 100644 --- a/paddle/fluid/operators/atan2_op.cc +++ b/paddle/fluid/operators/atan2_op.cc @@ -105,8 +105,8 @@ class Atan2OpVarTypeInference : public framework::VarTypeInference { } // namespace paddle namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(atan2, Atan2InferShapeFunctor, - PT_INFER_META(phi::Atan2InferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(atan2, Atan2InferShapeFunctor, + PD_INFER_META(phi::Atan2InferMeta)); REGISTER_OPERATOR(atan2, ops::Atan2Op, ops::Atan2OpMaker, ops::Atan2GradMaker, ops::Atan2GradMaker, diff --git a/paddle/fluid/operators/attention_lstm_op.cc b/paddle/fluid/operators/attention_lstm_op.cc index a23e484d0a88bb87febc6d320f9183ef50ea0ebc..78ea8b6b6fbebd7e0ca5ce14cc2cba6ff197177f 100644 --- a/paddle/fluid/operators/attention_lstm_op.cc +++ b/paddle/fluid/operators/attention_lstm_op.cc @@ -14,10 +14,10 @@ limitations under the License. */ #include "paddle/fluid/operators/attention_lstm_op.h" #include -#include "paddle/fluid/operators/math/cpu_vec.h" #include "paddle/fluid/operators/math/fc.h" #include "paddle/fluid/platform/cpu_info.h" #include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/cpu_vec.h" namespace paddle { namespace operators { @@ -269,10 +269,10 @@ use lstm_x_t as input and compute as standard LSTM. template inline void bias_relu(const int n, const T* x, const T* bias, T* y) { if (bias) { - math::vec_add_bias(n, *bias, x, y); - math::vec_relu(n, y, y); + phi::funcs::vec_add_bias(n, *bias, x, y); + phi::funcs::vec_relu(n, y, y); } else { - math::vec_relu(n, x, y); + phi::funcs::vec_relu(n, x, y); } } @@ -283,14 +283,14 @@ inline void vec_softmax(const int n, const T* x, T* y) { for (int i = 1; i < n; ++i) { scalar = scalar < x[i] ? 
x[i] : scalar; } - math::vec_add_bias(n, -scalar, x, y); // sub - math::vec_exp(n, y, y); // exp + phi::funcs::vec_add_bias(n, -scalar, x, y); // sub + phi::funcs::vec_exp(n, y, y); // exp // sum scalar = T(0); for (int i = 0; i < n; ++i) { scalar += y[i]; } - math::vec_scal(n, static_cast(1) / scalar, y); // scale + phi::funcs::vec_scal(n, static_cast(1) / scalar, y); // scale } template @@ -344,12 +344,12 @@ class AttentionLSTMKernel : public framework::OpKernel { auto& act_cell_str = ctx.Attr("cell_activation"); auto& act_cand_str = ctx.Attr("candidate_activation"); if (platform::MayIUse(platform::avx)) { - math::VecActivations act_functor; + phi::funcs::VecActivations act_functor; act_gate = act_functor(act_gate_str); act_cell = act_functor(act_cell_str); act_cand = act_functor(act_cand_str); } else { - math::VecActivations act_functor; + phi::funcs::VecActivations act_functor; act_gate = act_functor(act_gate_str); act_cell = act_functor(act_cell_str); act_cand = act_functor(act_cand_str); diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index 949cf021cf0fa322970c210fa26f698fd2bc45b2..174207deb08b84194d6f20fe04e4c27245295caf 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -1289,15 +1289,3 @@ REGISTER_OPERATOR(batch_norm_grad, ops::BatchNormGradOp, ops::BatchNormDoubleGradMaker); REGISTER_OPERATOR(batch_norm_grad_grad, ops::BatchNormDoubleGradOp, ops::BatchNormDoubleGradOpInplaceInferer); - -REGISTER_OP_CPU_KERNEL( - batch_norm, ops::BatchNormKernel, - ops::BatchNormKernel); -REGISTER_OP_CPU_KERNEL( - batch_norm_grad, - ops::BatchNormGradKernel, - ops::BatchNormGradKernel); -REGISTER_OP_CPU_KERNEL( - batch_norm_grad_grad, - ops::BatchNormDoubleGradKernel, - ops::BatchNormDoubleGradKernel); diff --git a/paddle/fluid/operators/batch_norm_op.cu b/paddle/fluid/operators/batch_norm_op.cu index d59396db1517faadaa2dd9e9af770d2e8a23ec56..a19b087245a89a4a12f062b1ce27835b98ecfd66 100644 --- a/paddle/fluid/operators/batch_norm_op.cu +++ b/paddle/fluid/operators/batch_norm_op.cu @@ -41,1327 +41,5 @@ using CudnnDataType = platform::CudnnDataType; template using BatchNormParamType = typename CudnnDataType::BatchNormParamType; -template -static __global__ void BNForwardInference( - const T *x, const BatchNormParamType *mean, - const BatchNormParamType *variance, const BatchNormParamType *scale, - const BatchNormParamType *bias, const int C, const int N, const int HxW, - const double epsilon, T *y) { - int gid = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - int num = N * C * HxW; - for (int i = gid; i < num; i += stride) { - const int c = layout == framework::DataLayout::kNCHW ? 
i / HxW % C : i % C; - BatchNormParamType x_sub_mean = - static_cast>(x[i]) - mean[c]; - BatchNormParamType inv_var = 1 / sqrt(variance[c] + epsilon); - y[i] = static_cast(scale[c] * x_sub_mean * inv_var + bias[c]); - } -} - -template -static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTraining( - const T *x, const BatchNormParamType *scale, - const BatchNormParamType *bias, const int C, const int N, const int HxW, - const double epsilon, double exponentialAverageFactor, T *y, - BatchNormParamType *mean, BatchNormParamType *variance, - BatchNormParamType *save_mean, - BatchNormParamType *save_inv_variance) { - int outer_size = C; - int inner_size = N * HxW; - typedef cub::BlockReduce, BlockDim> BlockReduce; - __shared__ typename BlockReduce::TempStorage mean_storage; - __shared__ typename BlockReduce::TempStorage variance_storeage; - __shared__ BatchNormParamType mean_val; - __shared__ BatchNormParamType variance_val; - __shared__ BatchNormParamType inv_var_val; - - for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { - BatchNormParamType x_sum = static_cast>(0); - BatchNormParamType x_square_sum = static_cast>(0); - - for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { - const int index = layout == framework::DataLayout::kNCHW - ? (j / HxW * C + i) * HxW + j % HxW - : j * outer_size + i; - BatchNormParamType x_i = static_cast>(x[index]); - x_sum += x_i; - x_square_sum += x_i * x_i; - } - x_sum = BlockReduce(mean_storage).Reduce(x_sum, cub::Sum()); - x_square_sum = - BlockReduce(variance_storeage).Reduce(x_square_sum, cub::Sum()); - if (threadIdx.x == 0) { - mean_val = x_sum / inner_size; - variance_val = x_square_sum / inner_size - mean_val * mean_val; - inv_var_val = 1 / sqrt(variance_val + epsilon); - - if (save_mean && save_inv_variance) { - save_mean[i] = mean_val; - save_inv_variance[i] = inv_var_val; - } - mean[i] = (1 - exponentialAverageFactor) * mean_val + - exponentialAverageFactor * mean[i]; - variance[i] = (1 - exponentialAverageFactor) * variance_val + - exponentialAverageFactor * variance[i]; - } - __syncthreads(); - - for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { - const int index = layout == framework::DataLayout::kNCHW - ? (j / HxW * C + i) * HxW + j % HxW - : j * outer_size + i; - BatchNormParamType x_sub_mean = - static_cast>(x[index]) - mean_val; - y[index] = scale[i] * x_sub_mean * inv_var_val + bias[i]; - } - } -} - -template -class BatchNormKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::InvalidArgument("It must use CUDAPlace.")); - double epsilon = static_cast(ctx.Attr("epsilon")); - float momentum = ctx.Attr("momentum"); - const bool is_test = ctx.Attr("is_test"); - const bool use_global_stats = ctx.Attr("use_global_stats"); - const bool trainable_stats = ctx.Attr("trainable_statistics"); - const std::string data_layout_str = ctx.Attr("data_layout"); - const DataLayout data_layout = - framework::StringToDataLayout(data_layout_str); - - bool test_mode = is_test && (!trainable_stats); - - // Get the size for each dimension. 
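    // (For reference: the deleted BNForwardInference/BNForwardTraining kernels
    //  above, like the cuDNN calls further down, compute
    //      y = scale[c] * (x - mean[c]) / sqrt(variance[c] + epsilon) + bias[c],
    //  where the channel c of element i is (i / (H*W*D)) % C for NCHW and
    //  i % C for NHWC. Inference uses the running mean/variance inputs, while
    //  training reduces mean and variance per channel over the N*H*W*D
    //  elements, stores 1/sqrt(var + epsilon) in SavedVariance, and updates
    //  the running statistics as an exponential moving average controlled by
    //  momentum.)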
- // NCHW [batch_size, in_channels, in_height, in_width] - const auto *x = ctx.Input("X"); - const auto &x_dims = x->dims(); - PADDLE_ENFORCE_EQ( - x_dims.size() >= 2 && x_dims.size() <= 5, true, - platform::errors::InvalidArgument( - "The size of input's dimensions should be between 2 and 5" - "But received: the size of input's dimensions is [%d]", - x_dims.size())); - - auto *y = ctx.Output("Y"); - y->mutable_data(ctx.GetPlace()); - - int N, C, H, W, D; - ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); - - auto dtype = platform::CudnnDataType::type; - -#ifdef PADDLE_WITH_HIP - auto compute_format = data_layout == DataLayout::kNHWC ? DataLayout::kNHWC - : DataLayout::kNCHW; - -// TODO(wangran16): wait for MIOpen to improve the performance of BN -// HIP do not support compute format of NHWC -// auto compute_format = DataLayout::kNCHW; -#else - const bool fast_nhwc_batch_norm = - test_mode || - (dtype == CUDNN_DATA_HALF && FLAGS_cudnn_batchnorm_spatial_persistent); - - auto compute_format = - fast_nhwc_batch_norm && data_layout == DataLayout::kNHWC - ? DataLayout::kNHWC - : DataLayout::kNCHW; -#endif - - Tensor transformed_x(x->type()); - Tensor transformed_y(y->type()); - if (data_layout == DataLayout::kNHWC && - compute_format == DataLayout::kNCHW && x_dims.size() > 2) { - VLOG(3) << "Transform input tensor from NHWC to NCHW."; - ResizeToChannelFirst(ctx, x, - &transformed_x); - TransToChannelFirst(ctx, x, - &transformed_x); - ResizeToChannelFirst(ctx, y, - &transformed_y); - } else { - transformed_x.ShareDataWith(*x); - transformed_y.ShareDataWith(*y); - } - -// ------------------- cudnn descriptors --------------------- -#ifdef PADDLE_WITH_HIP -// TODO(wangran16): wait for MIOpen to improve the performance of BN -// miopenTensorDescriptor_t data_desc_; -// miopenTensorDescriptor_t bn_param_desc_; -// miopenBatchNormMode_t mode_; - -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_)); -#else - cudnnTensorDescriptor_t data_desc_; - cudnnTensorDescriptor_t bn_param_desc_; - cudnnBatchNormMode_t mode_; - - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_)); -#endif - - if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { - LOG(ERROR) << "Provided epsilon is smaller than " - << "CUDNN_BN_MIN_EPSILON. 
Setting it to " - << "CUDNN_BN_MIN_EPSILON instead."; - } - epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); - -#ifdef PADDLE_WITH_HIP -// TODO(wangran16): wait for MIOpen to improve the performance of BN -// mode_ = miopenBNSpatial; -#elif CUDNN_VERSION_MIN(7, 0, 1) - if (FLAGS_cudnn_batchnorm_spatial_persistent) { - mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; - } else if (H == 1 && W == 1) { - mode_ = CUDNN_BATCHNORM_PER_ACTIVATION; - } else { - mode_ = CUDNN_BATCHNORM_SPATIAL; - } -#else - if (H == 1 && W == 1) { - mode_ = CUDNN_BATCHNORM_PER_ACTIVATION; - } else { - mode_ = CUDNN_BATCHNORM_SPATIAL; - } -#endif // CUDNN_VERSION_MIN(7, 0, 1) - - VLOG(3) << "Setting descriptors."; - std::vector dims; - std::vector strides; - if (compute_format == DataLayout::kNCHW) { - dims = {N, C, H, W, D}; - strides = {C * H * W * D, H * W * D, W * D, D, 1}; - } else { - dims = {N, C, H, W, D}; - strides = {H * W * D * C, 1, W * D * C, D * C, C}; - } - -#ifdef PADDLE_WITH_HIP -// TODO(wangran16): wait for MIOpen to improve the performance of BN -// PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor( -// data_desc_, CudnnDataType::type, -// x_dims.size() > 3 ? x_dims.size() : 4, const_cast(dims.data()), -// const_cast(strides.data()))); -// Note: PERSISTENT not implemented for inference -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenDeriveBNTensorDescriptor( -// bn_param_desc_, data_desc_, test_mode ? miopenBNSpatial : mode_)); -#else - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( - data_desc_, CudnnDataType::type, - x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data())); - // Note: PERSISTENT not implemented for inference - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnDeriveBNTensorDescriptor( - bn_param_desc_, data_desc_, - test_mode ? CUDNN_BATCHNORM_SPATIAL : mode_)); -#endif - - const auto *scale = ctx.Input("Scale"); - const auto *bias = ctx.Input("Bias"); - - auto &dev_ctx = ctx.template device_context(); - - auto handle = dev_ctx.cudnn_handle(); - - // Now, depending on whether we are running test or not, we have two paths. - // It is training mode when it's not reference AND not using pre-trained - // model. - bool training = !test_mode && !use_global_stats; - if (!training) { - // only when test we use input to do computation. - const auto *est_mean = ctx.Input("Mean"); - const auto *est_var = ctx.Input("Variance"); - // Run inference mode. - PADDLE_ENFORCE_EQ( - est_mean->dims().size(), 1UL, - platform::errors::InvalidArgument( - "The size of mean's dimensions must equal to 1." - "But received: the size of mean's dimensions mean is [%d]," - "the dimensions of mean is [%s].", - est_mean->dims().size(), est_mean->dims())); - PADDLE_ENFORCE_EQ( - est_var->dims().size(), 1UL, - platform::errors::InvalidArgument( - "The size of variance's dimensions must equal to 1." - "But received: the size of variance's dimensions is [%d]," - "the dimensions of variance is [%s].", - est_var->dims().size(), est_var->dims())); - PADDLE_ENFORCE_EQ( - est_mean->dims()[0], C, - platform::errors::InvalidArgument( - "The first dimension of mean must equal to the number of " - "Channels, which is [%d]. But received: the first dimension" - "of mean is [%d], the dimensions of mean is [%s].", - C, est_mean->dims()[0], est_mean->dims())); - PADDLE_ENFORCE_EQ( - est_var->dims()[0], C, - platform::errors::InvalidArgument( - "The first dimension of variance must equal to the number" - "of Channels, which is [%d]. 
But received: the first dimension of" - "variance is [%d], the dimensions of variance is [%s].", - C, est_var->dims()[0], est_var->dims())); - -#ifdef PADDLE_WITH_HIP - const int block_size = 256; - const int grid_size = (N * C * H * W * D + block_size - 1) / block_size; - if (compute_format == DataLayout::kNCHW) { - BNForwardInference< - T, - DataLayout::kNCHW><<>>( - transformed_x.template data(), - est_mean->template data>(), - est_var->template data>(), - scale->template data>(), - bias->template data>(), C, N, H * W * D, - epsilon, transformed_y.template data()); - } else { - BNForwardInference< - T, - DataLayout::kNHWC><<>>( - transformed_x.template data(), - est_mean->template data>(), - est_var->template data>(), - scale->template data>(), - bias->template data>(), C, N, H * W * D, - epsilon, transformed_y.template data()); - } - -// TODO(wangran16): wait for MIOpen to improve the performance of BN -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenBatchNormalizationForwardInference( -// handle, miopenBNSpatial, -// const_cast( -// static_cast(CudnnDataType::kOne())), -// const_cast( -// static_cast(CudnnDataType::kZero())), -// data_desc_, -// static_cast(transformed_x.template data()), -// data_desc_, -// static_cast( -// transformed_y.template mutable_data(ctx.GetPlace())), -// bn_param_desc_, -// const_cast(static_cast( -// scale->template data>())), -// const_cast(static_cast( -// bias->template data>())), -// const_cast(static_cast( -// est_mean->template data>())), -// const_cast(static_cast( -// est_var->template data>())), -// epsilon)); -#else - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnBatchNormalizationForwardInference( - handle, - // Note: PERSISTENT not implemented for inference - CUDNN_BATCHNORM_SPATIAL, CudnnDataType::kOne(), - CudnnDataType::kZero(), data_desc_, - transformed_x.template data(), data_desc_, - transformed_y.template mutable_data(ctx.GetPlace()), - bn_param_desc_, scale->template data>(), - bias->template data>(), - est_mean->template data>(), - est_var->template data>(), epsilon)); -#endif - } else { - // if MomentumTensor is set, use MomentumTensor value, momentum - // is only used in this training branch - if (ctx.HasInput("MomentumTensor")) { - const auto *mom_tensor = ctx.Input("MomentumTensor"); - Tensor mom_cpu; - paddle::framework::TensorCopySync(*mom_tensor, platform::CPUPlace(), - &mom_cpu); - momentum = mom_cpu.data()[0]; - } - - // Run training mode. - // obtain running mean and running inv var, and there is no need - // to initialize them. - - auto *mean_out = ctx.Output("MeanOut"); - auto *variance_out = ctx.Output("VarianceOut"); - mean_out->mutable_data>(ctx.GetPlace()); - variance_out->mutable_data>(ctx.GetPlace()); - - auto *saved_mean = ctx.Output("SavedMean"); - auto *saved_variance = ctx.Output("SavedVariance"); - saved_mean->mutable_data>(ctx.GetPlace()); - saved_variance->mutable_data>(ctx.GetPlace()); - - if ((N * H * W * D) == 1) { - // Only 1 element in normalization dimension, - // skip the batch norm calculation, let y = x. - framework::TensorCopy(*x, ctx.GetPlace(), y); - } else { - double this_factor = 1. - momentum; - - bool called = false; -#if CUDNN_VERSION_MIN(7, 4, 1) - called = true; - size_t workspace_size = 0; - size_t reserve_space_size = 0; - void *reserve_space_ptr = nullptr; - void *workspace_ptr = nullptr; - Tensor workspace_tensor; - // Create reserve space and workspace for batch norm. - // Create tensor for each batchnorm op, it will be used in the - // backward. 
Thus this tensor shouldn't be temp. - auto *reserve_space = ctx.Output("ReserveSpace"); - PADDLE_ENFORCE_NOT_NULL( - reserve_space, - platform::errors::NotFound( - "The argument ReserveSpace of batch_norm op is not found.")); - - // --------------- cudnn batchnorm workspace --------------- - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload:: - cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize( - /*handle=*/handle, - /*mode=*/mode_, - /*bnIps=*/CUDNN_BATCHNORM_OPS_BN, - /*xDesc=*/data_desc_, - /*zDesc=*/nullptr, - /*yDesc=*/data_desc_, - /*bnScaleBiasMeanVarDesc=*/bn_param_desc_, - /*activationDesc=*/nullptr, - /*sizeInBytes=*/&workspace_size)); - - // -------------- cudnn batchnorm reserve space -------------- - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload:: - cudnnGetBatchNormalizationTrainingExReserveSpaceSize( - /*handle=*/handle, - /*mode=*/mode_, - /*bnOps=*/CUDNN_BATCHNORM_OPS_BN, - /*activationDesc=*/nullptr, - /*xDesc=*/data_desc_, - /*sizeInBytes=*/&reserve_space_size)); - - reserve_space_ptr = reserve_space->mutable_data( - ctx.GetPlace(), transformed_x.type(), reserve_space_size); - workspace_ptr = workspace_tensor.mutable_data( - ctx.GetPlace(), transformed_x.type(), workspace_size); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnBatchNormalizationForwardTrainingEx( - handle, mode_, CUDNN_BATCHNORM_OPS_BN, CudnnDataType::kOne(), - CudnnDataType::kZero(), data_desc_, - transformed_x.template data(), nullptr, nullptr, data_desc_, - transformed_y.template data(), bn_param_desc_, - scale->template data>(), - bias->template data>(), this_factor, - mean_out->template mutable_data>( - ctx.GetPlace()), - variance_out->template mutable_data>( - ctx.GetPlace()), - epsilon, - saved_mean->template mutable_data>( - ctx.GetPlace()), - saved_variance->template mutable_data>( - ctx.GetPlace()), - nullptr, workspace_ptr, workspace_size, reserve_space_ptr, - reserve_space_size)); -#endif // CUDNN_VERSION_MIN(7, 4, 1) - if (!called) { -#ifdef PADDLE_WITH_HIP - const int num = transformed_x.numel(); - const int block = 256; - const int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); - const int max_blocks = std::max(max_threads / block, 1); - const int grid = std::min(C, max_blocks); - if (compute_format == DataLayout::kNCHW) { - BNForwardTraining< - T, block, - DataLayout::kNCHW><<>>( - transformed_x.template data(), - scale->template data>(), - bias->template data>(), C, N, H * W * D, - epsilon, this_factor, transformed_y.template data(), - mean_out->template data>(), - variance_out->template data>(), - saved_mean->template data>(), - saved_variance->template data>()); - } else { - BNForwardTraining< - T, block, - DataLayout::kNHWC><<>>( - transformed_x.template data(), - scale->template data>(), - bias->template data>(), C, N, H * W * D, - epsilon, this_factor, transformed_y.template data(), - mean_out->template data>(), - variance_out->template data>(), - saved_mean->template data>(), - saved_variance->template data>()); - } - -// TODO(wangran16): wait for MIOpen to improve the performance of BN -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenBatchNormalizationForwardTraining( -// handle, mode_, const_cast(static_cast( -// CudnnDataType::kOne())), -// const_cast( -// static_cast(CudnnDataType::kZero())), -// data_desc_, -// static_cast(transformed_x.template data()), -// data_desc_, -// static_cast( -// transformed_y.template mutable_data(ctx.GetPlace())), -// bn_param_desc_, -// const_cast(static_cast( -// scale->template data>())), -// 
const_cast(static_cast( -// bias->template data>())), -// this_factor, -// static_cast( -// mean_out->template mutable_data>( -// ctx.GetPlace())), -// static_cast(variance_out->template mutable_data< -// BatchNormParamType>(ctx.GetPlace())), -// epsilon, -// static_cast( -// saved_mean->template mutable_data>( -// ctx.GetPlace())), -// static_cast(saved_variance->template mutable_data< -// BatchNormParamType>(ctx.GetPlace())))); -#else - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnBatchNormalizationForwardTraining( - handle, mode_, CudnnDataType::kOne(), - CudnnDataType::kZero(), data_desc_, - transformed_x.template data(), data_desc_, - transformed_y.template mutable_data(ctx.GetPlace()), - bn_param_desc_, scale->template data>(), - bias->template data>(), this_factor, - mean_out->template mutable_data>( - ctx.GetPlace()), - variance_out->template mutable_data>( - ctx.GetPlace()), - epsilon, - saved_mean->template mutable_data>( - ctx.GetPlace()), - saved_variance->template mutable_data>( - ctx.GetPlace()))); -#endif - } - } - } - - if (data_layout == DataLayout::kNHWC && - compute_format == DataLayout::kNCHW && x_dims.size() > 2) { - VLOG(3) << "Transform batchnorm output from NCHW to NHWC"; - TransToChannelLast( - ctx, &transformed_y, y); - } -#ifdef PADDLE_WITH_HIP -// TODO(wangran16): wait for MIOpen to improve the performance of BN -// clean when exit. -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_)); -#else - // clean when exit. - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_)); -#endif - } -}; - -template -static __global__ LAUNCH_BOUNDS(BlockDim) void KeBNBackwardScaleBias( - const T *dy, const T *x, const BatchNormParamType *mean, - const BatchNormParamType *variance, const double epsilon, const int N, - const int C, const int HxW, BatchNormParamType *dscale, - BatchNormParamType *dbias) { - const int outer_size = C; - const int inner_size = N * HxW; - typedef cub::BlockReduce, BlockDim> BlockReduce; - __shared__ typename BlockReduce::TempStorage ds_storage; - __shared__ typename BlockReduce::TempStorage db_storage; - - for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { - BatchNormParamType ds_sum = static_cast>(0); - BatchNormParamType db_sum = static_cast>(0); - - BatchNormParamType inv_var_i = 1.0 / sqrt(variance[i] + epsilon); - BatchNormParamType mean_i = mean[i]; - for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { - const int index = layout == framework::DataLayout::kNCHW - ? 
(j / HxW * C + i) * HxW + j % HxW - : j * outer_size + i; - ds_sum += static_cast>(dy[index]) * - (static_cast>(x[index]) - mean_i); - db_sum += static_cast>(dy[index]); - } - ds_sum = BlockReduce(ds_storage).Reduce(ds_sum, cub::Sum()); - db_sum = BlockReduce(db_storage).Reduce(db_sum, cub::Sum()); - if (threadIdx.x == 0) { - dscale[i] = ds_sum * inv_var_i; - dbias[i] = db_sum; - } - __syncthreads(); - } -} - -template -static __global__ void KeBNBackwardData(const T *dy, - const BatchNormParamType *scale, - const BatchNormParamType *variance, - const double epsilon, const int C, - const int HxW, const int num, T *dx) { - int gid = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - for (int i = gid; i < num; i += stride) { - const int c = layout == framework::DataLayout::kNCHW ? i / HxW % C : i % C; - BatchNormParamType inv_var = 1.0 / sqrt(variance[c] + epsilon); - dx[i] = static_cast(static_cast>(dy[i]) * - scale[c] * inv_var); - } -} - -template -static __global__ void KeBNRestoreData(const framework::DataLayout layout, T *x, - const BatchNormParamType *scale, - const BatchNormParamType *bias, - const BatchNormParamType *mean, - const BatchNormParamType *variance, - double epsilon, int C, int M, - const int num, const T *y) { - int gid = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - for (int i = gid; i < num; i += stride) { - const int c = layout == framework::DataLayout::kNCHW ? (i / M) % C : i % C; - auto y_i = static_cast>(y[i]); - auto x_i = (y_i - bias[c]) / scale[c] / variance[c] + mean[c]; - x[i] = static_cast(x_i); - } -} - -template -class InplaceHelper { - public: - void operator()(const framework::DataLayout layout, T *x, - const BatchNormParamType *scale, - const BatchNormParamType *bias, - const BatchNormParamType *mean, - const BatchNormParamType *variance, double epsilon, int C, - int M, const int num, const T *y, int grid2, const int block, - const gpuStream_t &stream) { - PADDLE_ENFORCE_EQ(x, y, platform::errors::InvalidArgument( - "X and Y should be inplaced in inplace mode")); - KeBNRestoreData<<>>( - layout, x, scale, bias, mean, variance, epsilon, C, M, num, y); - } -}; - -template -static __global__ LAUNCH_BOUNDS(BlockDim) void BNBackward( - const T *dy, const T *x, const BatchNormParamType *scale, - const BatchNormParamType *saved_mean, - const BatchNormParamType *saved_inv_variance, const int C, const int N, - const int HxW, const double epsilon, T *dx, BatchNormParamType *dscale, - BatchNormParamType *dbias) { - const int outer_size = C; - const int inner_size = N * HxW; - typedef cub::BlockReduce, BlockDim> BlockReduce; - __shared__ typename BlockReduce::TempStorage ds_storage; - __shared__ typename BlockReduce::TempStorage db_storage; - __shared__ typename BlockReduce::TempStorage mean_storage; - __shared__ typename BlockReduce::TempStorage variance_storeage; - __shared__ BatchNormParamType inv_var_val; - __shared__ BatchNormParamType mean_val; - __shared__ BatchNormParamType dscale_val; - __shared__ BatchNormParamType dbias_val; - - for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { - BatchNormParamType ds_sum = static_cast>(0); - BatchNormParamType db_sum = static_cast>(0); - - if (saved_mean && saved_inv_variance) { - if (threadIdx.x == 0) { - inv_var_val = saved_inv_variance[i]; - mean_val = saved_mean[i]; - } - } else { - BatchNormParamType x_sum = static_cast>(0); - BatchNormParamType x_square_sum = - static_cast>(0); - - for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { - 
const int index = layout == framework::DataLayout::kNCHW - ? (j / HxW * C + i) * HxW + j % HxW - : j * outer_size + i; - BatchNormParamType x_i = - static_cast>(x[index]); - x_sum += x_i; - x_square_sum += x_i * x_i; - } - x_sum = BlockReduce(mean_storage).Reduce(x_sum, cub::Sum()); - x_square_sum = - BlockReduce(variance_storeage).Reduce(x_square_sum, cub::Sum()); - if (threadIdx.x == 0) { - mean_val = x_sum / inner_size; - inv_var_val = - 1 / sqrt(x_square_sum / inner_size - mean_val * mean_val + epsilon); - } - } - __syncthreads(); - - for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { - const int index = layout == framework::DataLayout::kNCHW - ? (j / HxW * C + i) * HxW + j % HxW - : j * outer_size + i; - BatchNormParamType dy_i = - static_cast>(dy[index]); - ds_sum += - dy_i * (static_cast>(x[index]) - mean_val); - db_sum += dy_i; - } - ds_sum = BlockReduce(ds_storage).Reduce(ds_sum, cub::Sum()); - db_sum = BlockReduce(db_storage).Reduce(db_sum, cub::Sum()); - if (threadIdx.x == 0) { - dscale_val = ds_sum * inv_var_val; - dbias_val = db_sum; - dscale[i] = dscale_val; - dbias[i] = dbias_val; - } - __syncthreads(); - - for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { - const int index = layout == framework::DataLayout::kNCHW - ? (j / HxW * C + i) * HxW + j % HxW - : j * outer_size + i; - dx[index] = scale[i] * inv_var_val * - (static_cast>(dy[index]) - - dbias_val / static_cast>(inner_size) - - (static_cast>(x[index]) - mean_val) * - inv_var_val * dscale_val / inner_size); - } - } -} - -template -static __global__ LAUNCH_BOUNDS(BlockDim) void BNBackwardData( - const T *dy, const BatchNormParamType *scale, - const BatchNormParamType *mean, const T *x, - const BatchNormParamType *variance, const int C, const int N, - const int HxW, T *dx) { - const int outer_size = C; - const int inner_size = N * HxW; - typedef cub::BlockReduce, BlockDim> BlockReduce; - __shared__ typename BlockReduce::TempStorage dy_storage; - __shared__ typename BlockReduce::TempStorage dy_x_sub_mean_storage; - __shared__ BatchNormParamType dy_sum_val; - __shared__ BatchNormParamType dy_x_sub_mean_sum_val; - - for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { - BatchNormParamType inv_var_i = variance[i]; - BatchNormParamType mean_i = mean[i]; - BatchNormParamType dy_sum = static_cast>(0); - BatchNormParamType dy_x_sub_mean_sum = - static_cast>(0); - for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { - const int index = layout == framework::DataLayout::kNCHW - ? (j / HxW * C + i) * HxW + j % HxW - : j * outer_size + i; - BatchNormParamType dy_i = - static_cast>(dy[index]); - dy_sum += dy_i; - dy_x_sub_mean_sum += - dy_i * (static_cast>(x[index]) - mean_i); - } - - dy_sum = BlockReduce(dy_storage).Reduce(dy_sum, cub::Sum()); - dy_x_sub_mean_sum = BlockReduce(dy_x_sub_mean_storage) - .Reduce(dy_x_sub_mean_sum, cub::Sum()); - - if (threadIdx.x == 0) { - dy_sum_val = dy_sum; - dy_x_sub_mean_sum_val = dy_x_sub_mean_sum; - } - __syncthreads(); - for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { - const int index = layout == framework::DataLayout::kNCHW - ? 
(j / HxW * C + i) * HxW + j % HxW - : j * outer_size + i; - dx[index] = - (static_cast>(dy[index]) - - dy_sum_val / static_cast>(inner_size) - - (static_cast>(x[index]) - mean_i) * - dy_x_sub_mean_sum_val * inv_var_i * inv_var_i / inner_size) * - scale[i] * inv_var_i; - } - } -} - -template -class BatchNormGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::InvalidArgument("It must use CUDAPlace.")); - double epsilon = static_cast(ctx.Attr("epsilon")); - const std::string data_layout_str = ctx.Attr("data_layout"); - bool use_global_stats = ctx.Attr("use_global_stats"); - - const DataLayout data_layout = - framework::StringToDataLayout(data_layout_str); - const auto *d_y = ctx.Input(framework::GradVarName("Y")); - const auto *scale = ctx.Input("Scale"); - const auto *bias = ctx.Input("Bias"); - - auto *d_x = ctx.Output(framework::GradVarName("X")); - auto *d_scale = ctx.Output(framework::GradVarName("Scale")); - auto *d_bias = ctx.Output(framework::GradVarName("Bias")); - - // batch_norm with inplace as false will take X as grad input, which - // is same as cuDNN batch_norm backward calculation, batch_norm - // with inplace as true only take Y as input and X should be calculate - // by inverse operation of batch_norm on Y - const Tensor *x; - bool is_inplace; - if (ctx.HasInput("Y")) { - x = ctx.Input("Y"); - is_inplace = true; - if (d_x) { - PADDLE_ENFORCE_EQ(d_x, d_y, - platform::errors::InvalidArgument( - "X@GRAD and Y@GRAD not inplace in inplace mode")); - } - } else { - x = ctx.Input("X"); - is_inplace = false; - if (d_x) { - PADDLE_ENFORCE_NE( - d_x, d_y, platform::errors::InvalidArgument( - "X@GRAD and Y@GRAD inplaced in non-inplace mode")); - } - } - - const bool is_test = ctx.Attr("is_test"); - use_global_stats = is_test || use_global_stats; - - const auto &x_dims = x->dims(); - - PADDLE_ENFORCE_EQ( - x_dims.size() >= 2 && x_dims.size() <= 5, true, - platform::errors::InvalidArgument( - "The size of input's dimensions should be between 2 and 5." - "But received: the size of input's dimensions is [%d]," - "the dimensions of input is [%s]", - x_dims.size(), x_dims)); - int N, C, H, W, D; - ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); - - // init output - if (d_x) { - d_x->mutable_data(ctx.GetPlace()); - } - - if (d_scale && d_bias) { - d_scale->mutable_data>(ctx.GetPlace()); - d_bias->mutable_data>(ctx.GetPlace()); - } - PADDLE_ENFORCE_EQ( - scale->dims().size(), 1UL, - platform::errors::InvalidArgument( - "The size of scale's dimensions must equal to 1. But received: " - "the size of scale's dimensions is [%d], the dimensions of scale " - "is [%s].", - scale->dims().size(), scale->dims())); - PADDLE_ENFORCE_EQ( - scale->dims()[0], C, - platform::errors::InvalidArgument( - "The first dimension of scale must equal to Channels[%d]. But " - "received: the first dimension of scale is [%d]", - C, scale->dims()[0])); - - auto dtype = platform::CudnnDataType::type; - const auto *reserve_space = ctx.Input("ReserveSpace"); -#ifdef PADDLE_WITH_HIP - auto compute_format = data_layout == DataLayout::kNHWC ? 
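As the comment above notes, in in-place mode only Y is available, so KeBNRestoreData/InplaceHelper invert the forward transform y = scale*(x - mean)*inv_std + bias before the gradient math runs. A scalar sketch of that inversion (illustrative only; inv_std stands in for the saved inverse variance the helper receives):

```cpp
#include <cassert>
#include <cmath>

// Forward: y = scale * (x - mean) * inv_std + bias
// Inverse used by the in-place path: x = (y - bias) / scale / inv_std + mean
float bn_forward(float x, float scale, float bias, float mean, float inv_std) {
  return scale * (x - mean) * inv_std + bias;
}
float bn_restore(float y, float scale, float bias, float mean, float inv_std) {
  return (y - bias) / scale / inv_std + mean;
}

int main() {
  const float x = 3.7f, mean = 2.0f, var = 4.0f, eps = 1e-5f;
  const float inv_std = 1.0f / std::sqrt(var + eps);
  const float y = bn_forward(x, /*scale=*/1.5f, /*bias=*/0.25f, mean, inv_std);
  assert(std::fabs(bn_restore(y, 1.5f, 0.25f, mean, inv_std) - x) < 1e-4f);
  return 0;
}
```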
DataLayout::kNHWC - : DataLayout::kNCHW; - -// TODO(wangran16): wait for MIOpen to improve the performance of BN -// HIP do not support compute format of NHWC -// auto compute_format = DataLayout::kNCHW; -#else - const bool fast_nhwc_batch_norm = - dtype == CUDNN_DATA_HALF && FLAGS_cudnn_batchnorm_spatial_persistent && - reserve_space != nullptr; - auto compute_format = - fast_nhwc_batch_norm && data_layout == DataLayout::kNHWC - ? DataLayout::kNHWC - : DataLayout::kNCHW; -#endif - - Tensor transformed_x(x->type()); - Tensor transformed_d_y(d_y->type()); - Tensor transformed_d_x; - if (data_layout == DataLayout::kNHWC && - compute_format == DataLayout::kNCHW && x_dims.size() > 2) { - VLOG(3) << "Transform input tensor from NHWC to NCHW."; - ResizeToChannelFirst(ctx, x, - &transformed_x); - TransToChannelFirst(ctx, x, - &transformed_x); - ResizeToChannelFirst(ctx, d_y, - &transformed_d_y); - TransToChannelFirst(ctx, d_y, - &transformed_d_y); - if (d_x) { - ResizeToChannelFirst(ctx, d_x, - &transformed_d_x); - } - } else { - transformed_x.ShareDataWith(*x); - transformed_d_y.ShareDataWith(*d_y); - if (d_x) { - transformed_d_x.ShareDataWith(*d_x); - } - } - - std::vector dims; - std::vector strides; - if (compute_format == DataLayout::kNCHW) { - dims = {N, C, H, W, D}; - strides = {C * H * W * D, H * W * D, W * D, D, 1}; - } else { - dims = {N, C, H, W, D}; - strides = {H * W * C * D, 1, W * D * C, D * C, C}; - } - - auto &dev_ctx = ctx.template device_context(); - const int num = transformed_x.numel(); -#ifdef HIPCC - const int block = 256; -#else - const int block = 512; -#endif - int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); - const int max_blocks = std::max(max_threads / block, 1); - int grid1 = (num + block - 1) / block; - int grid2 = std::min(C, max_blocks); - auto stream = dev_ctx.stream(); - InplaceHelper inplace_functor; - - if (!use_global_stats) { - if ((N * H * W * D) == 1) { - if (d_x) { - framework::TensorCopy(*d_y, ctx.GetPlace(), d_x); - } - phi::funcs::SetConstant> - functor; - functor(dev_ctx, d_scale, static_cast>(0)); - functor(dev_ctx, d_bias, static_cast>(0)); - return; - } - -// ------------------- cudnn descriptors --------------------- -#ifdef PADDLE_WITH_HIP -// TODO(wangran16): wait for MIOpen to improve the performance of BN -// miopenTensorDescriptor_t data_desc_; -// miopenTensorDescriptor_t bn_param_desc_; -// miopenBatchNormMode_t mode_; - -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_)); -#else - cudnnTensorDescriptor_t data_desc_; - cudnnTensorDescriptor_t bn_param_desc_; - cudnnBatchNormMode_t mode_; - - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_)); -#endif - if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { - LOG(ERROR) << "Provided epsilon is smaller than " - << "CUDNN_BN_MIN_EPSILON. 
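The dims/strides pair above describes both layouts to cuDNN with the same logical {N, C, H, W, D} dims; only the strides change. A small sketch of how those two stride vectors follow from the memory layout (hypothetical helpers, same ordering as above):

```cpp
#include <cstdio>
#include <vector>

// Strides for a logical {N, C, H, W, D} descriptor. Sketch only; the real
// code builds these vectors inline before calling the cuDNN descriptor API.
std::vector<int> nchw_strides(int C, int H, int W, int D) {
  return {C * H * W * D, H * W * D, W * D, D, 1};
}
std::vector<int> nhwc_strides(int C, int H, int W, int D) {
  // Channels are innermost in memory, so the C stride is 1.
  return {H * W * C * D, 1, W * D * C, D * C, C};
}

int main() {
  for (int s : nchw_strides(3, 4, 5, 1)) std::printf("%d ", s);
  std::printf("\n");
  for (int s : nhwc_strides(3, 4, 5, 1)) std::printf("%d ", s);
  std::printf("\n");
  return 0;
}
```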
Setting it to " - << "CUDNN_BN_MIN_EPSILON instead."; - } - epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); -#ifdef PADDLE_WITH_HIP -// TODO(wangran16): wait for MIOpen to improve the performance of BN -// mode_ = miopenBNSpatial; -#elif CUDNN_VERSION_MIN(7, 0, 1) - if (FLAGS_cudnn_batchnorm_spatial_persistent) { - mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; - } else if (H == 1 && W == 1) { - mode_ = CUDNN_BATCHNORM_PER_ACTIVATION; - } else { - mode_ = CUDNN_BATCHNORM_SPATIAL; - } -#else - if (H == 1 && W == 1) { - mode_ = CUDNN_BATCHNORM_PER_ACTIVATION; - } else { - mode_ = CUDNN_BATCHNORM_SPATIAL; - } -#endif // CUDNN_VERSION_MIN(7, 0, 1) - -#ifdef PADDLE_WITH_HIP -// TODO(wangran16): wait for MIOpen to improve the performance of BN -// PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor( -// data_desc_, CudnnDataType::type, -// x_dims.size() > 3 ? x_dims.size() : 4, const_cast(dims.data()), -// const_cast(strides.data()))); -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenDeriveBNTensorDescriptor(bn_param_desc_, -// data_desc_, mode_)); -#else - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( - data_desc_, CudnnDataType::type, - x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data())); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnDeriveBNTensorDescriptor(bn_param_desc_, - data_desc_, mode_)); -#endif - - const auto *saved_mean = ctx.Input("SavedMean"); - const auto *saved_var = ctx.Input("SavedVariance"); - const auto *saved_mean_data = - saved_mean->template data>(); - const auto *saved_var_data = - saved_var->template data>(); - - if (is_inplace) { - inplace_functor(compute_format, transformed_x.data(), - scale->template data>(), - bias->template data>(), - saved_mean_data, saved_var_data, epsilon, C, H * W * D, - num, transformed_x.data(), grid2, block, stream); - } - - // This branch calls CUDNN APIs - if (d_x && d_scale && d_bias) { - bool called = false; -#if CUDNN_VERSION_MIN(7, 4, 1) - called = true; - size_t workspace_size = 0; - void *workspace_ptr = nullptr; - Tensor workspace_tensor; - auto reserve_space_size = reserve_space->memory_size(); - // --------------- cudnn batchnorm workspace --------------- - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload:: - cudnnGetBatchNormalizationBackwardExWorkspaceSize( - /*handle=*/dev_ctx.cudnn_handle(), - /*mode=*/mode_, - /*bnIps=*/CUDNN_BATCHNORM_OPS_BN, - /*xDesc=*/data_desc_, - /*yDesc=*/data_desc_, - /*dyDesc=*/data_desc_, - /*dzDesc=*/nullptr, - /*dxDesc=*/data_desc_, - /*bnScaleBiasMeanVarDesc=*/bn_param_desc_, - /*activationDesc=*/nullptr, - /*sizeInBytes=*/&workspace_size)); - - workspace_ptr = workspace_tensor.mutable_data( - ctx.GetPlace(), transformed_x.type(), workspace_size); - - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnBatchNormalizationBackwardEx( - /*handle=*/dev_ctx.cudnn_handle(), - /*mode=*/mode_, - /*bnOps=*/CUDNN_BATCHNORM_OPS_BN, - /*alphaDataDiff=*/CudnnDataType::kOne(), - /*betaDataDiff=*/CudnnDataType::kZero(), - /*alphaParamDiff=*/CudnnDataType::kOne(), - /*betaParamDiff=*/CudnnDataType::kZero(), - /*xDesc=*/data_desc_, - /*xData=*/transformed_x.template data(), - /*yDesc=*/nullptr, - /*yData=*/nullptr, - /*dyDesc=*/data_desc_, - /*dyData=*/transformed_d_y.template data(), - /*dzDesc=*/nullptr, - /*dzData=*/nullptr, - /*dxDesc=*/data_desc_, - /*dxData=*/transformed_d_x.template mutable_data( - ctx.GetPlace()), - /*dBnScaleBiasDesc=*/bn_param_desc_, - /*bnScaleData=*/scale->template data>(), - 
/*bnBiasData=*/nullptr, - /*dBnScaleData=*/d_scale - ->template mutable_data>( - ctx.GetPlace()), - /*dBnBiasData=*/d_bias - ->template mutable_data>( - ctx.GetPlace()), - /*epsilon=*/epsilon, - /*savedMean=*/saved_mean_data, - /*savedInvVariance=*/saved_var_data, - /*activationDesc=*/nullptr, - /*workspace=*/workspace_ptr, - /*workSpaceSizeInBytes=*/workspace_size, - /*reserveSpace=*/const_cast( - reserve_space->template data()), - /*reserveSpaceSizeInBytes=*/reserve_space_size)); -#endif // CUDNN_VERSION_MIN(7, 4, 1) - if (!called) { -#ifdef PADDLE_WITH_HIP - if (compute_format == DataLayout::kNCHW) { - BNBackward< - T, block, - DataLayout::kNCHW><<>>( - transformed_d_y.template data(), - transformed_x.template data(), - scale->template data>(), saved_mean_data, - saved_var_data, C, N, H * W * D, epsilon, - transformed_d_x.template data(), - d_scale->template mutable_data>( - ctx.GetPlace()), - d_bias->template mutable_data>( - ctx.GetPlace())); - } else { - BNBackward< - T, block, - DataLayout::kNHWC><<>>( - transformed_d_y.template data(), - transformed_x.template data(), - scale->template data>(), saved_mean_data, - saved_var_data, C, N, H * W * D, epsilon, - transformed_d_x.template data(), - d_scale->template mutable_data>( - ctx.GetPlace()), - d_bias->template mutable_data>( - ctx.GetPlace())); - } - -// TODO(wangran16): wait for MIOpen to improve the performance of BN -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenBatchNormalizationBackward( -// dev_ctx.cudnn_handle(), mode_, CudnnDataType::kOne(), -// CudnnDataType::kZero(), CudnnDataType::kOne(), -// CudnnDataType::kZero(), data_desc_, -// transformed_x.template data(), data_desc_, -// transformed_d_y.template data(), data_desc_, -// transformed_d_x.template mutable_data(ctx.GetPlace()), -// bn_param_desc_, scale->template data>(), -// d_scale->template mutable_data>( -// ctx.GetPlace()), -// d_bias->template mutable_data>( -// ctx.GetPlace()), -// epsilon, saved_mean_data, saved_var_data)); -#else - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnBatchNormalizationBackward( - dev_ctx.cudnn_handle(), mode_, CudnnDataType::kOne(), - CudnnDataType::kZero(), CudnnDataType::kOne(), - CudnnDataType::kZero(), data_desc_, - transformed_x.template data(), data_desc_, - transformed_d_y.template data(), data_desc_, - transformed_d_x.template mutable_data(ctx.GetPlace()), - bn_param_desc_, scale->template data>(), - d_scale->template mutable_data>( - ctx.GetPlace()), - d_bias->template mutable_data>( - ctx.GetPlace()), - epsilon, saved_mean_data, saved_var_data)); -#endif - } - - if (data_layout == DataLayout::kNHWC && - compute_format == DataLayout::kNCHW) { - VLOG(3) << "Transform batchnorm output from NCHW to NHWC"; - TransToChannelLast( - ctx, &transformed_d_x, d_x); - } - } else { - // This branch call CUDA kernels - if (compute_format == DataLayout::kNCHW) { - if (d_x) { - BNBackwardData<<< - grid2, block, 0, dev_ctx.stream()>>>( - d_y->data(), scale->data>(), - saved_mean_data, x->data(), saved_var_data, C, N, H * W * D, - d_x->data()); - } - if (d_scale && d_bias) { - KeBNBackwardScaleBias< - T, block, - framework::DataLayout::kNCHW><<>>( - d_y->data(), x->data(), saved_mean_data, saved_var_data, - epsilon, N, C, H * W * D, - d_scale->data>(), - d_bias->data>()); - } - } else { - if (d_x) { - BNBackwardData<<< - grid2, block, 0, dev_ctx.stream()>>>( - d_y->data(), scale->data>(), - saved_mean_data, x->data(), saved_var_data, C, N, H * W * D, - d_x->data()); - } - if (d_scale && d_bias) { - KeBNBackwardScaleBias< 
- T, block, - framework::DataLayout::kNHWC><<>>( - d_y->data(), x->data(), saved_mean_data, saved_var_data, - epsilon, N, C, H * W * D, - d_scale->data>(), - d_bias->data>()); - } - } - } - -#ifdef PADDLE_WITH_HIP -// TODO(wangran16): wait for MIOpen to improve the performance of BN -// clean when exit. -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_)); -#else - // clean when exit. - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_)); -#endif - } else { - const auto *running_mean = ctx.Input("Mean"); - const auto *running_var = ctx.Input("Variance"); - - const auto *running_mean_data = - running_mean->template data>(); - const auto *running_var_data = - running_var->template data>(); - - if (is_inplace) { - auto px = *x; - inplace_functor(data_layout, px.mutable_data(ctx.GetPlace()), - scale->template data>(), - bias->template data>(), - running_mean_data, running_var_data, epsilon, C, - H * W * D, num, x->data(), grid2, block, stream); - } - - if (compute_format == DataLayout::kNCHW) { - if (d_x) { - KeBNBackwardData< - T, framework::DataLayout::kNCHW><<>>( - d_y->data(), scale->data>(), - running_var_data, epsilon, C, H * W, num, d_x->data()); - } - if (d_scale && d_bias) { - KeBNBackwardScaleBias< - T, block, - framework::DataLayout::kNCHW><<>>( - d_y->data(), x->data(), running_mean_data, running_var_data, - epsilon, N, C, H * W * D, d_scale->data>(), - d_bias->data>()); - } - } else { - if (d_x) { - KeBNBackwardData< - T, framework::DataLayout::kNHWC><<>>( - d_y->data(), scale->data>(), - running_var_data, epsilon, C, H * W, num, d_x->data()); - } - if (d_scale && d_bias) { - KeBNBackwardScaleBias< - T, block, - framework::DataLayout::kNHWC><<>>( - d_y->data(), x->data(), running_mean_data, running_var_data, - epsilon, N, C, H * W * D, d_scale->data>(), - d_bias->data>()); - } - } - } - } -}; - -template -class BatchNormDoubleGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - const auto *X = ctx.Input("X"); - const auto *Scale = ctx.Input("Scale"); - const auto *dY = ctx.Input("DY"); - const auto *Saved_mean = ctx.Input("SavedMean"); - const auto *Saved_variance = ctx.Input("SavedVariance"); - const double epsilon = static_cast(ctx.Attr("epsilon")); - const bool use_global_stats = ctx.Attr("use_global_stats"); - const bool is_test = ctx.Attr("is_test"); - - PADDLE_ENFORCE_EQ( - is_test, false, - platform::errors::InvalidArgument( - "`is_test = True` CANNOT be used in train program. 
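In the use_global_stats branch above, the statistics are treated as constants, so KeBNBackwardData reduces to a per-channel rescaling dx = dy * scale / sqrt(running_var + eps). A scalar sketch of that simplification (float precision, illustrative only):

```cpp
#include <cmath>
#include <cstdio>

// With frozen (global) statistics the mean/variance carry no gradient, so the
// data gradient is just a per-channel rescaling of dy. Sketch only.
float bn_dx_global_stats(float dy, float scale, float running_var, float eps) {
  return dy * scale / std::sqrt(running_var + eps);
}

int main() {
  // 0.5 * 2.0 / sqrt(4.0 + 1e-5) is approximately 0.5
  std::printf("%f\n", bn_dx_global_stats(0.5f, 2.0f, 4.0f, 1e-5f));
  return 0;
}
```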
If " - "you want to use global status in pre_train model, " - "please set `use_global_stats = True`")); - - const std::string data_layout_str = ctx.Attr("data_layout"); - const DataLayout data_layout = - framework::StringToDataLayout(data_layout_str); - - const auto *ddX = ctx.Input("DDX"); - const auto *ddScale = ctx.Input("DDScale"); - const auto *ddBias = ctx.Input("DDBias"); - - auto *dX = ctx.Output("DX"); - auto *dScale = ctx.Output("DScale"); - auto *ddY = ctx.Output("DDY"); - - NormDoubleGradFunctor( - ctx, data_layout, X, Scale, dY, Saved_mean, Saved_variance, epsilon, - use_global_stats, ddX, ddScale, ddBias, dX, dScale, ddY); - } -}; - } // namespace operators } // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -#ifdef PADDLE_WITH_HIP -// MIOPEN do not support double -REGISTER_OP_CUDA_KERNEL( - batch_norm, ops::BatchNormKernel, - ops::BatchNormKernel); -REGISTER_OP_CUDA_KERNEL( - batch_norm_grad, ops::BatchNormGradKernel, - ops::BatchNormGradKernel); -REGISTER_OP_CUDA_KERNEL( - batch_norm_grad_grad, - ops::BatchNormDoubleGradKernel); -#else -REGISTER_OP_CUDA_KERNEL( - batch_norm, ops::BatchNormKernel, - ops::BatchNormKernel, - ops::BatchNormKernel); -REGISTER_OP_CUDA_KERNEL( - batch_norm_grad, ops::BatchNormGradKernel, - ops::BatchNormGradKernel, - ops::BatchNormGradKernel); -REGISTER_OP_CUDA_KERNEL( - batch_norm_grad_grad, - ops::BatchNormDoubleGradKernel, - ops::BatchNormDoubleGradKernel); -#endif diff --git a/paddle/fluid/operators/bce_loss_op.cc b/paddle/fluid/operators/bce_loss_op.cc index 55bb57466c7b5ec4f4ac3c51b1cf84ab5098a0e9..bc9076f4d7c368f60187e9e432dd175d1f5ad45b 100644 --- a/paddle/fluid/operators/bce_loss_op.cc +++ b/paddle/fluid/operators/bce_loss_op.cc @@ -138,8 +138,8 @@ DECLARE_INPLACE_OP_INFERER(BCELossGradInplaceInferer, } // namespace paddle namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(bce_loss, BCELossInferShapeFunctor, - PT_INFER_META(phi::BCELossInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(bce_loss, BCELossInferShapeFunctor, + PD_INFER_META(phi::BCELossInferMeta)); REGISTER_OPERATOR(bce_loss, ops::BCELossOp, ops::BCELossOpMaker, ops::BCELossGradOpMaker, diff --git a/paddle/fluid/operators/bilinear_tensor_product_op.cc b/paddle/fluid/operators/bilinear_tensor_product_op.cc index 4774c0a1dbc3b78607d75efb7bc82d590ca4aa2a..9f6a78ab7a55f32558accd56e69d757003bad89c 100644 --- a/paddle/fluid/operators/bilinear_tensor_product_op.cc +++ b/paddle/fluid/operators/bilinear_tensor_product_op.cc @@ -90,12 +90,12 @@ class BilinearTensorProductGradOpMaker namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(bilinear_tensor_product, +DECLARE_INFER_SHAPE_FUNCTOR(bilinear_tensor_product, BilinearTensorProductInferShapeFunctor, - PT_INFER_META(phi::BilinearTensorProductInferMeta)); -DELCARE_INFER_SHAPE_FUNCTOR( + PD_INFER_META(phi::BilinearTensorProductInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR( bilinear_tensor_product_grad, BilinearTensorProductGradInferShapeFunctor, - PT_INFER_META(phi::BilinearTensorProductGradInferMeta)); + PD_INFER_META(phi::BilinearTensorProductGradInferMeta)); REGISTER_OPERATOR( bilinear_tensor_product, ops::BilinearTensorProductOp, diff --git a/paddle/fluid/operators/bincount_op.cc b/paddle/fluid/operators/bincount_op.cc index b37334a14bad4fdc342d8fba13c117bfad5bd65c..062e7d510d54c0f657582d48844093d94732971e 100644 --- a/paddle/fluid/operators/bincount_op.cc +++ b/paddle/fluid/operators/bincount_op.cc @@ -12,12 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 
or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/bincount_op.h" - #include #include #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" + namespace paddle { namespace operators { @@ -28,51 +31,6 @@ class BincountOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( - "Input(X) of BincountOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output(Out) of BincountOp should not be null.")); - - auto input_dim = ctx->GetInputDim("X"); - auto minlength = ctx->Attrs().Get("minlength"); - - PADDLE_ENFORCE_GE(minlength, 0, - platform::errors::InvalidArgument( - "The minlength should be greater than or equal to 0." - "But received minlength is %d", - minlength)); - - PADDLE_ENFORCE_EQ(input_dim.size(), 1, - platform::errors::InvalidArgument( - "The 'shape' of Input(X) must be 1-D tensor." - "But the dimension of Input(X) is [%d]", - input_dim.size())); - - if (ctx->HasInput("Weights")) { - auto weights_dim = ctx->GetInputDim("Weights"); - PADDLE_ENFORCE_EQ(weights_dim.size(), 1, - platform::errors::InvalidArgument( - "The 'shape' of Input(Weights) must be 1-D tensor." - "But the dimension of Input(Weights) is [%d]", - weights_dim.size())); - - PADDLE_ENFORCE_EQ( - weights_dim[0], input_dim[0], - platform::errors::InvalidArgument( - "The 'shape' of Input(Weights) must be equal to the 'shape' of " - "Input(X)." - "But received: the 'shape' of Input(Weights) is [%s]," - "the 'shape' of Input(X) is [%s]", - weights_dim, input_dim)); - } - - ctx->SetOutputDim("Out", phi::make_ddim({-1})); - ctx->ShareLoD("X", /*->*/ "Out"); - } - framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const { auto data_type = @@ -105,12 +63,10 @@ class BincountOpMaker : public framework::OpProtoAndCheckerMaker { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(bincount, BincountInferShapeFunctor, + PD_INFER_META(phi::BincountInferMeta)); REGISTER_OPERATOR( bincount, ops::BincountOp, ops::BincountOpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL( - bincount, ops::BincountKernel, - ops::BincountKernel, - ops::BincountKernel, - ops::BincountKernel); + paddle::framework::EmptyGradOpMaker, + BincountInferShapeFunctor); diff --git a/paddle/fluid/operators/bincount_op.cu b/paddle/fluid/operators/bincount_op.cu deleted file mode 100644 index cc576d0af92877dff44d672597596036be0defbc..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/bincount_op.cu +++ /dev/null @@ -1,162 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
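The pattern repeated across the operator files in this diff is the one visible above for bincount: drop the hand-written InferShape and register an infer-shape functor that forwards to a phi InferMeta function through the corrected DECLARE_INFER_SHAPE_FUNCTOR / PD_INFER_META macros. A self-contained sketch of the idea behind that adapter, with stand-in types (InferShapeContext, BincountInferMeta and friends here are local toys, not the real Paddle classes):

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// Shape inference is written once as a "meta" function over plain shapes; a
// thin functor adapts it to whatever context the op framework provides.
using Shape = std::vector<int64_t>;

// Toy stand-in for the phi InferMeta function.
Shape BincountInferMeta(const Shape& x, int minlength) {
  (void)x;          // the real meta also checks that x is 1-D
  (void)minlength;  // output length is data-dependent, so it stays dynamic
  return {-1};
}

// Toy stand-in for the generated functor: pull inputs/attrs from the context
// and forward to the meta function.
struct InferShapeContext {
  Shape x;
  int minlength;
  Shape out;
};

struct BincountInferShapeFunctor {
  void operator()(InferShapeContext* ctx) const {
    ctx->out = BincountInferMeta(ctx->x, ctx->minlength);
  }
};

int main() {
  InferShapeContext ctx{{8}, 0, {}};
  BincountInferShapeFunctor{}(&ctx);
  std::printf("out rank = %zu, dim0 = %lld\n", ctx.out.size(),
              static_cast<long long>(ctx.out[0]));
  return 0;
}
```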
-See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/operators/bincount_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" -#include "paddle/phi/core/hostdevice.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using platform::PADDLE_CUDA_NUM_THREADS; - -inline int GET_BLOCKS(const int N) { - return (N + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS; -} - -template -__global__ void KernelBincount(const InputT* input, const int total_elements, - const bool has_weights, const T* weights, - OutT* output) { - if (!has_weights) { - for (int i = threadIdx.x; i < total_elements; i += blockDim.x) { - paddle::platform::CudaAtomicAdd(&output[input[i]], 1L); - } - } else { - for (int i = threadIdx.x; i < total_elements; i += blockDim.x) { - paddle::platform::CudaAtomicAdd(&output[input[i]], - static_cast(weights[i])); - } - } -} - -template -void BincountCUDAInner(const framework::ExecutionContext& context) { - const Tensor* input = context.Input("X"); - const Tensor* weights = context.Input("Weights"); - Tensor* output = context.Output("Out"); - auto& minlength = context.Attr("minlength"); - - const InputT* input_data = input->data(); - - const int input_numel = input->numel(); - - if (input_data == nullptr) { - framework::DDim out_dim{0}; - output->Resize(out_dim); - output->mutable_data(context.GetPlace()); - return; - } - auto input_x = framework::EigenVector::Flatten(*input); - - framework::Tensor input_min_t, input_max_t; - auto* input_max_data = - input_max_t.mutable_data({1}, context.GetPlace()); - auto* input_min_data = - input_min_t.mutable_data({1}, context.GetPlace()); - - auto input_max_scala = framework::EigenScalar::From(input_max_t); - auto input_min_scala = framework::EigenScalar::From(input_min_t); - - auto* place = context.template device_context().eigen_device(); - input_max_scala.device(*place) = input_x.maximum(); - input_min_scala.device(*place) = input_x.minimum(); - - Tensor input_min_cpu, input_max_cpu; - paddle::framework::TensorCopySync(input_max_t, platform::CPUPlace(), - &input_max_cpu); - paddle::framework::TensorCopySync(input_min_t, platform::CPUPlace(), - &input_min_cpu); - - InputT input_min = input_min_cpu.data()[0]; - - PADDLE_ENFORCE_GE( - input_min, static_cast(0), - platform::errors::InvalidArgument( - "The elements in input tensor must be non-negative ints")); - - int64_t output_size = - static_cast(input_max_cpu.data()[0]) + 1L; - - output_size = std::max(output_size, static_cast(minlength)); - framework::DDim out_dim{output_size}; - output->Resize(out_dim); - - bool has_weights = (weights != nullptr); - - const T* weights_data = has_weights ? 
weights->data() : nullptr; - - auto stream = - context.template device_context().stream(); - - if (!has_weights) { - int64_t* output_data = output->mutable_data(context.GetPlace()); - phi::funcs::SetConstant()( - context.template device_context(), output, 0L); - - KernelBincount<<>>( - input_data, input_numel, has_weights, weights_data, output_data); - } else { - const auto& weights_type = framework::TransToProtoVarType(weights->dtype()); - - if (weights_type == framework::proto::VarType::FP32) { - float* output_data = output->mutable_data(context.GetPlace()); - phi::funcs::SetConstant()( - context.template device_context(), output, - static_cast(0)); - - KernelBincount<<>>( - input_data, input_numel, has_weights, weights_data, output_data); - } else { - double* output_data = output->mutable_data(context.GetPlace()); - phi::funcs::SetConstant()( - context.template device_context(), output, - static_cast(0)); - - KernelBincount<<>>( - input_data, input_numel, has_weights, weights_data, output_data); - } - } -} - -template -class BincountCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* input = context.Input("X"); - const auto& input_type = framework::TransToProtoVarType(input->dtype()); - - if (input_type == framework::proto::VarType::INT32) { - BincountCUDAInner(context); - } else if (input_type == framework::proto::VarType::INT64) { - BincountCUDAInner(context); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - bincount, ops::BincountCUDAKernel, - ops::BincountCUDAKernel, - ops::BincountCUDAKernel, - ops::BincountCUDAKernel); diff --git a/paddle/fluid/operators/bincount_op.h b/paddle/fluid/operators/bincount_op.h deleted file mode 100644 index 84256bf78e4a1901b76b356c5e3274541dc0dd59..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/bincount_op.h +++ /dev/null @@ -1,109 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
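Both the deleted CUDA path above and the CPU path that follows implement the same contract: the output length is max(input) + 1, but at least minlength, and each slot accumulates a count or the matching weight. A minimal serial sketch of that contract with a worked example (illustrative only, not the deleted kernel; input validation is omitted):

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

// Serial bincount sketch: out[v] counts occurrences of v (or sums weights).
std::vector<double> bincount(const std::vector<int64_t>& x,
                             const std::vector<double>& weights,  // empty => counts
                             int64_t minlength) {
  int64_t size = x.empty() ? 0 : *std::max_element(x.begin(), x.end()) + 1;
  size = std::max(size, minlength);
  std::vector<double> out(static_cast<size_t>(size), 0.0);
  for (size_t i = 0; i < x.size(); ++i) {
    out[static_cast<size_t>(x[i])] += weights.empty() ? 1.0 : weights[i];
  }
  return out;
}

int main() {
  // x = [1, 2, 2, 5], minlength = 4  ->  [0, 1, 2, 0, 0, 1]
  for (double v : bincount({1, 2, 2, 5}, {}, 4)) std::printf("%g ", v);
  std::printf("\n");
  return 0;
}
```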
*/ - -#pragma once - -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -void BincountInner(const framework::ExecutionContext& context) { - const Tensor* input = context.Input("X"); - const Tensor* weights = context.Input("Weights"); - Tensor* output = context.Output("Out"); - auto& minlength = context.Attr("minlength"); - - const InputT* input_data = input->data(); - - auto input_numel = input->numel(); - - if (input_data == nullptr) { - framework::DDim out_dim{0}; - output->Resize(out_dim); - output->mutable_data(context.GetPlace()); - return; - } - - PADDLE_ENFORCE_GE( - *std::min_element(input_data, input_data + input_numel), - static_cast(0), - platform::errors::InvalidArgument( - "The elements in input tensor must be non-negative ints")); - - int64_t output_size = static_cast(*std::max_element( - input_data, input_data + input_numel)) + - 1L; - output_size = std::max(output_size, static_cast(minlength)); - - framework::DDim out_dim{output_size}; - output->Resize(out_dim); - - bool has_weights = (weights != nullptr); - - if (has_weights) { - const T* weights_data = weights->data(); - const auto& weights_type = framework::TransToProtoVarType(weights->dtype()); - if (weights_type == framework::proto::VarType::FP32) { - float* output_data = output->mutable_data(context.GetPlace()); - phi::funcs::SetConstant()( - context.template device_context(), output, - static_cast(0)); - for (int64_t i = 0; i < input_numel; i++) { - output_data[input_data[i]] += static_cast(weights_data[i]); - } - } else { - double* output_data = output->mutable_data(context.GetPlace()); - phi::funcs::SetConstant()( - context.template device_context(), output, - static_cast(0)); - for (int64_t i = 0; i < input_numel; i++) { - output_data[input_data[i]] += static_cast(weights_data[i]); - } - } - - } else { - int64_t* output_data = output->mutable_data(context.GetPlace()); - phi::funcs::SetConstant()( - context.template device_context(), output, 0L); - for (int64_t i = 0; i < input_numel; i++) { - output_data[input_data[i]] += 1L; - } - } -} - -template -class BincountKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* input = context.Input("X"); - const auto& input_type = framework::TransToProtoVarType(input->dtype()); - - if (input_type == framework::proto::VarType::INT32) { - BincountInner(context); - } else if (input_type == framework::proto::VarType::INT64) { - BincountInner(context); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/broadcast_tensors_op.cc b/paddle/fluid/operators/broadcast_tensors_op.cc index 27b1107675d4e722f9a2e25801ecc4dfb206cce5..1063a8b7992153dbedcdc0442ac3d8038c5e171b 100644 --- a/paddle/fluid/operators/broadcast_tensors_op.cc +++ b/paddle/fluid/operators/broadcast_tensors_op.cc @@ -12,15 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/broadcast_tensors_op.h" - -#include -#include -#include -#include -#include - +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/var_type_inference.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/multiary.h" namespace paddle { namespace operators { @@ -31,64 +27,6 @@ class BroadcastTensorsOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInputs("X"), "Input", "X", "broadcast_tensors"); - OP_INOUT_CHECK(ctx->HasOutputs("Out"), "Output", "Out", - "broadcast_tensors"); - - int target_rank = 0; - const auto& input_dims = ctx->GetInputsDim("X"); - - // 1. Find Output rank = max(Inputs rank) - for (const auto& input_ddim : input_dims) { - target_rank = std::max(target_rank, input_ddim.size()); - } - - PADDLE_ENFORCE_GT( - target_rank, 0, - platform::errors::InvalidArgument( - "BroadcastTensorsOp requires at least one input tensor" - "to have rank greater than zero")); - - std::vector target_dims(target_rank, 0); - // 2. Output dim(axis=x) = max(Inputs dim(axis=x)) - for (int index = 0; index < target_rank; index++) { - // Loop axes in reverse order, - // For each axis, take the maximum as target size - // Fill size = 1 if shape vector exhausts - int target_dim_size = 1; - for (const auto& input_ddim : input_dims) { - // Reversed order - int axis = static_cast(input_ddim.size()) - index - 1; - int dim_size = 1; - if (axis >= 0) { - dim_size = input_ddim[axis]; - } - - if (target_dim_size != 1 && dim_size != 1 && - target_dim_size != dim_size) { - PADDLE_THROW(platform::errors::InvalidArgument( - "BroadcastTensorsOp inputs does not satisfy bcast semantics," - "Please check axis = %d in reverse order", - index)); - } - - // We performed bcast semantics check at python level - // So input tensors should all have legal shape - target_dim_size = std::max(target_dim_size, dim_size); - } - target_dims[target_rank - index - 1] = target_dim_size; - } - - // 3. 
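The InferShape being deleted here encodes numpy-style broadcasting over a list of tensors: shapes are aligned from the trailing axis, each axis must agree or be 1, and the output takes the maximum; the same rule is what the newly registered phi::BroadcastTensorsInferMeta is expected to enforce. A compact sketch of that rule (hypothetical helper):

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <stdexcept>
#include <vector>

// Numpy-style multi-input broadcast: align from the trailing axis; each axis
// size must be 1 or equal across inputs, and the result takes the maximum.
std::vector<int64_t> BroadcastShape(const std::vector<std::vector<int64_t>>& shapes) {
  size_t target_rank = 0;
  for (const auto& s : shapes) target_rank = std::max(target_rank, s.size());
  std::vector<int64_t> out(target_rank, 1);
  for (size_t i = 0; i < target_rank; ++i) {  // i counts from the back
    for (const auto& s : shapes) {
      if (i >= s.size()) continue;            // implicit size-1 axis
      int64_t d = s[s.size() - 1 - i];
      int64_t& o = out[target_rank - 1 - i];
      if (o != 1 && d != 1 && o != d)
        throw std::invalid_argument("incompatible broadcast dimensions");
      o = std::max(o, d);
    }
  }
  return out;
}

int main() {
  auto out = BroadcastShape({{3, 1, 4}, {1, 5, 4}, {4}});
  for (auto d : out) std::printf("%lld ", static_cast<long long>(d));  // 3 5 4
  std::printf("\n");
  return 0;
}
```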
Set Output Dim - std::vector output_ddims; - for (size_t i = 0; i < input_dims.size(); i++) { - output_ddims.emplace_back(phi::make_ddim(target_dims)); - } - ctx->SetOutputsDim("Out", output_ddims); - ctx->ShareAllLoD("X", /*->*/ "Out"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -229,34 +167,17 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(BroadcastTensorsGradNoNeedBufVarsInferer, namespace ops = paddle::operators; namespace plat = paddle::platform; +DECLARE_INFER_SHAPE_FUNCTOR(broadcast_tensors, + BroadcastTensorsInferShapeFunctor, + PD_INFER_META(phi::BroadcastTensorsInferMeta)); + REGISTER_OPERATOR(broadcast_tensors, ops::BroadcastTensorsOp, ops::BroadcastTensorsOpMaker, ops::BroadcastTensorsGradOpMaker, ops::BroadcastTensorsGradOpMaker, - ops::BroadcastTensorsOpVarTypeInference); + ops::BroadcastTensorsOpVarTypeInference, + BroadcastTensorsInferShapeFunctor); REGISTER_OPERATOR(broadcast_tensors_grad, ops::BroadcastTensorsGradOp, ops::BroadcastTensorsGradOpVarTypeInference, ops::BroadcastTensorsGradNoNeedBufVarsInferer); - -REGISTER_OP_CPU_KERNEL( - broadcast_tensors, - ops::BroadcastTensorsOpKernel, - ops::BroadcastTensorsOpKernel, - ops::BroadcastTensorsOpKernel, - ops::BroadcastTensorsOpKernel, - ops::BroadcastTensorsOpKernel, - ops::BroadcastTensorsOpKernel); - -REGISTER_OP_CPU_KERNEL( - broadcast_tensors_grad, - ops::BroadcastTensorsGradOpKernel, - ops::BroadcastTensorsGradOpKernel, - ops::BroadcastTensorsGradOpKernel, - ops::BroadcastTensorsGradOpKernel, - ops::BroadcastTensorsGradOpKernel); diff --git a/paddle/fluid/operators/broadcast_tensors_op.cu b/paddle/fluid/operators/broadcast_tensors_op.cu deleted file mode 100644 index 5882258317d7daa6c62905f8a76d5c68060787a8..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/broadcast_tensors_op.cu +++ /dev/null @@ -1,122 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/broadcast_tensors_op.h" - -#include -#include -#include -#include -#include - -#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" - -namespace paddle { -namespace operators { - -using framework::Tensor; -using framework::DDim; - -template -class CUDABroadcastTensorsGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - // Find reduce dimensions - const auto& in_tensors = - context.MultiInput(framework::GradVarName("Out")); - auto out_tensors = context.MultiOutput(framework::GradVarName("X")); - - size_t num_ins = in_tensors.size(); - - PADDLE_ENFORCE_GT( - num_ins, 1, - platform::errors::InvalidArgument( - "Expected at least 2 input tensors, but only received d%.", - in_tensors.size())); - - PADDLE_ENFORCE_EQ( - num_ins, out_tensors.size(), - platform::errors::InvalidArgument( - "BroadcastTensorsOp expects equal number of inputs and outputs," - "but received: %d inputs v.s %d outputs", - num_ins, out_tensors.size())); - - // For each In-Out tensor pair, - // Prepare and apply broadcast dims array - for (size_t i = 0; i < num_ins; i++) { - auto* input_tensor = in_tensors[i]; - auto* output_tensor = out_tensors[i]; - - const DDim& input_dims = input_tensor->dims(); - const DDim& output_dims = output_tensor->dims(); - - int in_rank = input_dims.size(); - int out_rank = output_dims.size(); - - // Collect reduce_dims - // Example: - // dX = [1,1,1,1] - // dOut = [1,1,1,4] - // - // reduce_dims = [3] // reduce along the broadcasted axis - std::vector reduce_dims_vec; - for (int j = 0; j < in_rank; j++) { - int out_axis = out_rank - j - 1; - int in_axis = in_rank - j - 1; - - if (out_axis < 0 || output_dims[out_axis] != input_dims[in_axis]) { - reduce_dims_vec.push_back(in_axis); - } - } - - bool just_copy = (reduce_dims_vec.size() == 0); - output_tensor->mutable_data(context.GetPlace()); - if (just_copy) { - // Turns out to be a No-Op, simply copy tensors - framework::TensorCopy(*input_tensor, context.GetPlace(), - context.device_context(), output_tensor); - } else { - // reduce_sum implementation on CUDA - auto stream = context.cuda_device_context().stream(); - TensorReduceImpl>( - context.cuda_device_context(), *input_tensor, output_tensor, - kps::IdentityFunctor(), reduce_dims_vec, stream); - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL( - broadcast_tensors, - ops::BroadcastTensorsOpKernel, - ops::BroadcastTensorsOpKernel, - ops::BroadcastTensorsOpKernel, - ops::BroadcastTensorsOpKernel, - ops::BroadcastTensorsOpKernel, - ops::BroadcastTensorsOpKernel); - -REGISTER_OP_CUDA_KERNEL(broadcast_tensors_grad, - ops::CUDABroadcastTensorsGradOpKernel, - ops::CUDABroadcastTensorsGradOpKernel, - ops::CUDABroadcastTensorsGradOpKernel, - ops::CUDABroadcastTensorsGradOpKernel, - ops::CUDABroadcastTensorsGradOpKernel); diff --git a/paddle/fluid/operators/broadcast_tensors_op.h b/paddle/fluid/operators/broadcast_tensors_op.h deleted file mode 100644 index 682f2e24769221d04317d0e53d02406c4c5a26eb..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/broadcast_tensors_op.h +++ /dev/null @@ -1,282 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
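The gradient of a broadcast is a reduce-sum over exactly the axes that were created or expanded, which is what the reduce_dims_vec loop above collects: align dOut and dX from the trailing axis and reduce wherever dX has no matching axis or a differing extent. A compact sketch of that selection (hypothetical helper):

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// For dX = reduce_sum(dOut), collect the axes of dOut that were created or
// expanded by broadcasting. Sketch only; mirrors the loop in the grad kernel.
std::vector<int> BroadcastGradReduceDims(const std::vector<int64_t>& dout_dims,
                                         const std::vector<int64_t>& dx_dims) {
  const int in_rank = static_cast<int>(dout_dims.size());
  const int out_rank = static_cast<int>(dx_dims.size());
  std::vector<int> reduce_dims;
  for (int j = 0; j < in_rank; ++j) {
    const int in_axis = in_rank - j - 1;
    const int out_axis = out_rank - j - 1;
    if (out_axis < 0 || dx_dims[out_axis] != dout_dims[in_axis]) {
      reduce_dims.push_back(in_axis);
    }
  }
  return reduce_dims;
}

int main() {
  // dOut: [2, 3, 4], dX: [3, 1]  ->  reduce over axes {2, 0} of dOut.
  for (int a : BroadcastGradReduceDims({2, 3, 4}, {3, 1})) std::printf("%d ", a);
  std::printf("\n");
  return 0;
}
```

If the collected list is empty, nothing was broadcast and the gradient is a plain copy, which is exactly the just_copy branch in the deleted kernels.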
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/lod_tensor_array.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/eigen/eigen_function.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -#define SWITCH_OUT_RANK_CASE(n) \ - case n: { \ - ApplyBroadcast(context, in_tensors[i], out_tensors[i]); \ - break; \ - } - -namespace paddle { -namespace operators { - -using framework::Tensor; -using framework::DDim; -using framework::EigenTensor; - -template -class BroadcastTensorsOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const auto& in_tensors = context.MultiInput("X"); - auto out_tensors = context.MultiOutput("Out"); - - size_t num_ins = in_tensors.size(); - - PADDLE_ENFORCE_GT( - num_ins, 1, - platform::errors::InvalidArgument( - "Expected at least 2 input tensors, but only received d%.", - in_tensors.size())); - - PADDLE_ENFORCE_EQ( - num_ins, out_tensors.size(), - platform::errors::InvalidArgument( - "BroadcastTensorsOp expects equal number of inputs and outputs," - "but received: %d inputs v.s %d outputs", - num_ins, out_tensors.size())); - - // Eigen has no support for dynamic ranked tensor - // Thus we perform static expansion for each possible ranks - for (size_t i = 0; i < num_ins; i++) { - int out_rank = out_tensors[i]->dims().size(); - switch (out_rank) { - SWITCH_OUT_RANK_CASE(1) - SWITCH_OUT_RANK_CASE(2) - SWITCH_OUT_RANK_CASE(3) - SWITCH_OUT_RANK_CASE(4) - SWITCH_OUT_RANK_CASE(5) - default: { - PADDLE_THROW(platform::errors::InvalidArgument( - "Target tensor rank out of range" - "Maximum supported rank for broadcast is: 5")); - } - } - } - } - - template - void ApplyBroadcast(const framework::ExecutionContext& context, - const Tensor* input_tensor, Tensor* output_tensor) const { - const auto& input_dims = input_tensor->dims(); - const auto& output_dims = output_tensor->dims(); - - int in_rank = input_dims.size(); - int out_rank = output_dims.size(); - - // 1. Collect bcast_dims, each element of which indicates how many - // times we need to replicate along the corresponding dimension - // 2. Collect new_input_dims_vec. 
Eigen::broadcast requires same rank for - // both input and output tensors, so we need to initialize input X with - // expanded dims: "new_input_dims_vec" - Eigen::DSizes bcast_dims; - std::vector new_input_dims_vec(out_rank); - for (int j = 0; j < out_rank; j++) { - int out_axis = out_rank - j - 1; - int in_axis = in_rank - j - 1; - - bcast_dims[out_axis] = output_dims[out_axis]; - new_input_dims_vec[out_axis] = 1; - if (in_axis >= 0 && input_dims[in_axis] == output_dims[out_axis]) { - bcast_dims[out_axis] = 1; - new_input_dims_vec[out_axis] = input_dims[in_axis]; - } - } - auto new_input_dims = phi::make_ddim(new_input_dims_vec); - - // Initialize input X with new_input_dims_vec, so it's rank-aligned with the - // output - auto x = EigenTensor::From(*input_tensor, new_input_dims); - - output_tensor->mutable_data(context.GetPlace()); - auto y = EigenTensor::From(*output_tensor, output_dims); - - auto& place = - *context.template device_context().eigen_device(); - EigenBroadcast, T, OutRank>::Eval(place, y, x, - bcast_dims); - } -}; - -#define SWITCH_RESHAPE_DIMS(n) \ - case n: { \ - Eigen::DSizes reshape_dims; \ - for (size_t i = 0; i < reshape_dims_vec.size(); ++i) { \ - reshape_dims[i] = reshape_dims_vec[i]; \ - } \ - dX.device(place) = \ - dOut.reshape(reshape_dims).sum(reduce_dims).reshape(dX.dimensions()); \ - break; \ - } - -#define UPPER_SWITCH_REDUCE_DIMS(m) \ - case m: { \ - Eigen::DSizes reduce_dims; \ - for (size_t i = 0; i < reduce_dims_vec.size(); ++i) { \ - reduce_dims[i] = reduce_dims_vec[i]; \ - } \ - switch (reshape_size) { -#define LOWER_SWITCH_REDUCE_DIMS \ - default: { \ - PADDLE_THROW(platform::errors::InvalidArgument( \ - "Detected reshape size: %d out of range" \ - "Minimum value should be larger than reduce size %d" \ - "While maximum supported is: 5", \ - reshape_size, reduce_size)); \ - } \ - } \ - break; \ - } - -/* ----- GradOpKernel ----- */ -template -class BroadcastTensorsGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - // Find reduce dimensions - const auto& in_tensors = - context.MultiInput(framework::GradVarName("Out")); - auto out_tensors = context.MultiOutput(framework::GradVarName("X")); - - size_t num_ins = in_tensors.size(); - - PADDLE_ENFORCE_GT( - num_ins, 1, - platform::errors::InvalidArgument( - "Expected at least 2 input tensors, but only received d%.", - in_tensors.size())); - - PADDLE_ENFORCE_EQ( - num_ins, out_tensors.size(), - platform::errors::InvalidArgument( - "BroadcastTensorsOp expects equal number of inputs and outputs," - "but received: %d inputs v.s %d outputs", - num_ins, out_tensors.size())); - - // For each In-Out tensor pair, - // Prepare and apply broadcast dims array - for (size_t i = 0; i < num_ins; i++) { - const auto* input_tensor = in_tensors[i]; - auto* output_tensor = out_tensors[i]; - - const auto& input_dims = input_tensor->dims(); - const auto& output_dims = output_tensor->dims(); - - int in_rank = input_dims.size(); - int out_rank = output_dims.size(); - - // BroadcastTensorsGrad is simply a reduce_sum along broadcasted axes - // Here we perform the following Eigen operations: - // dOut(Flattened) -> reshape(reshape_dims) -> reduce(reduce_dims) -> - // reshape(dX_shape) -> dX - // Note the last "reshape(dX_shape)" will be performed implicitly, - // and we only need to collect reduce_dims and reshape_dims - std::vector reduce_dims_vec; - std::vector reshape_dims_vec; - for (int j = 0; j < in_rank; j++) { - int out_axis = out_rank - j - 
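ApplyBroadcast above feeds Eigen a pair of vectors: per output axis, either keep the input extent with broadcast factor 1, or use extent 1 with the output size as the broadcast factor. A small sketch of that bookkeeping (hypothetical helper; shape compatibility is assumed to have been checked by InferShape):

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// Per output axis: if the input already matches, keep its extent and do not
// broadcast; otherwise pad the input with extent 1 and broadcast by the
// output extent. Sketch of the same arithmetic as ApplyBroadcast.
void BroadcastPlan(const std::vector<int64_t>& in_dims,
                   const std::vector<int64_t>& out_dims,
                   std::vector<int64_t>* bcast, std::vector<int64_t>* new_in) {
  const int in_rank = static_cast<int>(in_dims.size());
  const int out_rank = static_cast<int>(out_dims.size());
  bcast->assign(out_rank, 1);
  new_in->assign(out_rank, 1);
  for (int j = 0; j < out_rank; ++j) {
    const int out_axis = out_rank - j - 1;
    const int in_axis = in_rank - j - 1;
    if (in_axis >= 0 && in_dims[in_axis] == out_dims[out_axis]) {
      (*bcast)[out_axis] = 1;
      (*new_in)[out_axis] = in_dims[in_axis];
    } else {
      (*bcast)[out_axis] = out_dims[out_axis];
      (*new_in)[out_axis] = 1;
    }
  }
}

int main() {
  std::vector<int64_t> bcast, new_in;
  BroadcastPlan({3, 1}, {2, 3, 4}, &bcast, &new_in);
  // bcast = [2, 1, 4], new_in = [1, 3, 1]
  for (auto d : bcast) std::printf("%lld ", static_cast<long long>(d));
  std::printf("| ");
  for (auto d : new_in) std::printf("%lld ", static_cast<long long>(d));
  std::printf("\n");
  return 0;
}
```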
1; - int in_axis = in_rank - j - 1; - - reshape_dims_vec.push_back(input_dims[j]); - if (out_axis < 0 || output_dims[out_axis] != input_dims[in_axis]) { - reduce_dims_vec.push_back(in_axis); - } - } - - size_t reduce_size = reduce_dims_vec.size(); - size_t reshape_size = reshape_dims_vec.size(); - bool just_copy = (reduce_dims_vec.size() == 0); - output_tensor->mutable_data(context.GetPlace()); - if (just_copy) { - // If this turns out to be a No-Op, simply perform a tensor copy - framework::TensorCopy(*input_tensor, context.GetPlace(), - context.device_context(), output_tensor); - } else { - PADDLE_ENFORCE_GE(reduce_dims_vec.size(), 1, - platform::errors::InvalidArgument( - "The number of dimensions of the input " - "'Out@GRAD' for Op(broadcast_tensors)" - " must be greater than or equal to 1, but " - "the value received is %d.", - reduce_dims_vec.size())); - PADDLE_ENFORCE_LE( - reduce_dims_vec.size(), 5, - platform::errors::InvalidArgument( - "The number of dimensions of the input 'Out@GRAD' " - "for Op(broadcast_tensors) must be less than or equal " - "to 5, but the value received is %d.", - reduce_dims_vec.size())); - - // Overall: - // dOut(Flattened) -> reshape(reshape_dims) -> reduce(reduce_dims) -> - // reshape(dX_shape) -> dX - auto dX = framework::EigenVector::Flatten(*output_tensor); - auto dOut = framework::EigenVector::Flatten(*input_tensor); - auto& place = - *context.template device_context().eigen_device(); - - // Expand ReduceSize and ReshapeSize into static values - switch (reduce_size) { - UPPER_SWITCH_REDUCE_DIMS(1) - SWITCH_RESHAPE_DIMS(1) - SWITCH_RESHAPE_DIMS(2) - SWITCH_RESHAPE_DIMS(3) - SWITCH_RESHAPE_DIMS(4) - SWITCH_RESHAPE_DIMS(5) - LOWER_SWITCH_REDUCE_DIMS - - UPPER_SWITCH_REDUCE_DIMS(2) - SWITCH_RESHAPE_DIMS(2) - SWITCH_RESHAPE_DIMS(3) - SWITCH_RESHAPE_DIMS(4) - SWITCH_RESHAPE_DIMS(5) - LOWER_SWITCH_REDUCE_DIMS - - UPPER_SWITCH_REDUCE_DIMS(3) - SWITCH_RESHAPE_DIMS(3) - SWITCH_RESHAPE_DIMS(4) - SWITCH_RESHAPE_DIMS(5) - LOWER_SWITCH_REDUCE_DIMS - - UPPER_SWITCH_REDUCE_DIMS(4) - SWITCH_RESHAPE_DIMS(4) - SWITCH_RESHAPE_DIMS(5) - LOWER_SWITCH_REDUCE_DIMS - - UPPER_SWITCH_REDUCE_DIMS(5) - SWITCH_RESHAPE_DIMS(5) - LOWER_SWITCH_REDUCE_DIMS - - default: { - PADDLE_THROW(platform::errors::InvalidArgument( - "Detected reduce size: %d out of range" - "While maximum supported is: 5", - reduce_size)); - } - } - } - } - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/cast_op.cu b/paddle/fluid/operators/cast_op.cu index 5c7dd0e2561fa41313b2e65a443a9e4913a39961..eb51215790bbcdbc9e7d0c3adad482d9a69324b9 100644 --- a/paddle/fluid/operators/cast_op.cu +++ b/paddle/fluid/operators/cast_op.cu @@ -29,9 +29,5 @@ using CUDA = paddle::platform::CUDADeviceContext; ops::CastOpKernel>, \ ops::CastOpKernel>, ##__VA_ARGS__); -#if !defined(PADDLE_WITH_HIP) // See [ why register transfer_dtype_op alias with cast_op? 
] in cast_op.cc REGISTER_CAST_CUDA_BASE(transfer_dtype, ops::CastOpKernel) -#else -REGISTER_CAST_CUDA_BASE(transfer_dtype) -#endif diff --git a/paddle/fluid/operators/cholesky_op.cc b/paddle/fluid/operators/cholesky_op.cc index 09e915a6bafd4a8b72f35995b3ebbfeafa00476a..ed80ac076c0af7fc8922f095d4be4613bc5057ec 100644 --- a/paddle/fluid/operators/cholesky_op.cc +++ b/paddle/fluid/operators/cholesky_op.cc @@ -90,8 +90,8 @@ class CholeskyGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(cholesky, CholeskyInferShapeFunctor, - PT_INFER_META(phi::CholeskyInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(cholesky, CholeskyInferShapeFunctor, + PD_INFER_META(phi::CholeskyInferMeta)); REGISTER_OPERATOR(cholesky, ops::CholeskyOp, ops::CholeskyOpMaker, ops::CholeskyGradOpMaker, ops::CholeskyGradOpMaker, diff --git a/paddle/fluid/operators/cholesky_solve_op.cc b/paddle/fluid/operators/cholesky_solve_op.cc index 6b5bae8fc73fe2b71212a93144d89144dd0268c6..5403e2440ee58f1cf7cbad107f4d3e174655ed3b 100644 --- a/paddle/fluid/operators/cholesky_solve_op.cc +++ b/paddle/fluid/operators/cholesky_solve_op.cc @@ -12,8 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/cholesky_solve_op.h" -#include "paddle/fluid/operators/solve_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -39,50 +40,6 @@ class CholeskySolveOpMaker : public framework::OpProtoAndCheckerMaker { class CholeskySolveOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *context) const override { - OP_INOUT_CHECK(context->HasInput("X"), "Input", "X", "CholeskySolve"); - OP_INOUT_CHECK(context->HasInput("Y"), "Input", "Y", "CholeskySolve"); - OP_INOUT_CHECK(context->HasOutput("Out"), "Output", "Out", "CholeskySolve"); - auto u_dims = context->GetInputDim("Y"); - auto b_dims = context->GetInputDim("X"); - int u_rank = u_dims.size(); - int b_rank = b_dims.size(); - PADDLE_ENFORCE_GE(u_rank, 2, - platform::errors::InvalidArgument( - "the rank of input Y must greater or equal to 2")); - PADDLE_ENFORCE_GE(b_rank, 2, - platform::errors::InvalidArgument( - "the rank of input X must greater or equal to 2")); - PADDLE_ENFORCE_EQ(u_dims[u_rank - 1], u_dims[u_rank - 2], - platform::errors::InvalidArgument( - "input Matrix Y should be square matrix," - "But Got last shape of %ld x %ld", - u_dims[u_rank - 1], u_dims[u_rank - 2])); - PADDLE_ENFORCE_EQ( - b_dims[b_rank - 2], u_dims[u_rank - 2], - platform::errors::InvalidArgument( - "the first dim of input X must equal to the dim of input Y," - "But Got %ld and %ld", - b_dims[b_rank - 2], u_dims[u_rank - 2])); - - std::vector u_dims_vec = phi::vectorize(u_dims); - std::vector b_dims_vec = phi::vectorize(b_dims); - - std::vector u_dims_vec_cut(u_dims_vec.begin(), - u_dims_vec.end() - 2); - std::vector b_dims_vec_cut(b_dims_vec.begin(), - b_dims_vec.end() - 2); - - std::vector expand_batch_portion = - get_broadcast_batch_portion(u_dims_vec_cut, b_dims_vec_cut); - - std::vector b_broadcast_dims({expand_batch_portion}); - b_broadcast_dims.insert(b_broadcast_dims.end(), - {b_dims_vec[b_rank - 2], b_dims_vec[b_rank - 1]}); - - // dim of 
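The shape rule removed here, and now expected from phi::CholeskySolveInferMeta, is: both inputs are stacks of matrices, Y must be square, X must have matching rows, the leading batch dims broadcast, and Out keeps X's trailing matrix shape. A compact sketch of that rule (hypothetical helper):

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <stdexcept>
#include <vector>

// Out shape for cholesky_solve-style ops: broadcast the batch dims of X and Y,
// then append X's trailing [rows, cols]. Sketch only.
std::vector<int64_t> CholeskySolveOutShape(const std::vector<int64_t>& x_dims,
                                           const std::vector<int64_t>& y_dims) {
  const size_t xr = x_dims.size(), yr = y_dims.size();
  if (xr < 2 || yr < 2) throw std::invalid_argument("ranks must be >= 2");
  if (y_dims[yr - 1] != y_dims[yr - 2])
    throw std::invalid_argument("Y must be a (batch of) square matrix");
  if (x_dims[xr - 2] != y_dims[yr - 2])
    throw std::invalid_argument("rows of X must match the order of Y");

  // Broadcast the batch portions (everything but the last two dims).
  std::vector<int64_t> xb(x_dims.begin(), x_dims.end() - 2);
  std::vector<int64_t> yb(y_dims.begin(), y_dims.end() - 2);
  const size_t rank = std::max(xb.size(), yb.size());
  std::vector<int64_t> batch(rank, 1);
  for (size_t i = 0; i < rank; ++i) {
    int64_t a = i < xb.size() ? xb[xb.size() - 1 - i] : 1;
    int64_t b = i < yb.size() ? yb[yb.size() - 1 - i] : 1;
    if (a != b && a != 1 && b != 1)
      throw std::invalid_argument("batch dims are not broadcastable");
    batch[rank - 1 - i] = std::max(a, b);
  }
  batch.push_back(x_dims[xr - 2]);
  batch.push_back(x_dims[xr - 1]);
  return batch;
}

int main() {
  // X: [2, 1, 3, 4], Y: [5, 3, 3]  ->  Out: [2, 5, 3, 4]
  for (auto d : CholeskySolveOutShape({2, 1, 3, 4}, {5, 3, 3}))
    std::printf("%lld ", static_cast<long long>(d));
  std::printf("\n");
  return 0;
}
```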
'Out' is the same with 'Y' after broadcast - context->SetOutputDim("Out", phi::make_ddim(b_broadcast_dims)); - } protected: framework::OpKernelType GetExpectedKernelType( @@ -151,22 +108,15 @@ class CholeskySolveGradOp : public framework::OperatorWithKernel { } // namespace operators } // namespace paddle namespace ops = paddle::operators; + +DECLARE_INFER_SHAPE_FUNCTOR(cholesky_solve, CholeskySolveInferShapeFunctor, + PD_INFER_META(phi::CholeskySolveInferMeta)); + REGISTER_OPERATOR(cholesky_solve, ops::CholeskySolveOp, ops::CholeskySolveOpMaker, ops::CholeskySolveOpVarTypeInference, ops::CholeskySolveOpGradMaker, - ops::CholeskySolveOpGradMaker); + ops::CholeskySolveOpGradMaker, + CholeskySolveInferShapeFunctor); REGISTER_OPERATOR(cholesky_solve_grad, ops::CholeskySolveGradOp); - -REGISTER_OP_CPU_KERNEL( - cholesky_solve, - ops::CholeskySolveKernel, - ops::CholeskySolveKernel); - -REGISTER_OP_CPU_KERNEL( - cholesky_solve_grad, - ops::CholeskySolveGradKernel, - ops::CholeskySolveGradKernel); -// Complex<> is not supported because of TensorExpand, which used to boardcast -// input Tensor diff --git a/paddle/fluid/operators/cholesky_solve_op.cu b/paddle/fluid/operators/cholesky_solve_op.cu deleted file mode 100644 index 1b551a7cd0343db32a84e962212a25e1ff5a4893..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/cholesky_solve_op.cu +++ /dev/null @@ -1,136 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifndef PADDLE_WITH_HIP -// HIP not support cusolver - -#include "paddle/fluid/memory/memory.h" -#include "paddle/fluid/operators/cholesky_solve_op.h" -#include "paddle/fluid/platform/dynload/cusolver.h" -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using CUDADeviceContext = paddle::platform::CUDADeviceContext; - -template -void cusolver_potrs(const cusolverDnHandle_t &cusolverH, cublasFillMode_t uplo, - int n, int nrhs, T *Adata, int lda, T *Bdata, int ldb, - int *devInfo); - -template <> -void cusolver_potrs(const cusolverDnHandle_t &cusolverH, - cublasFillMode_t uplo, int n, int nrhs, float *Adata, - int lda, float *Bdata, int ldb, int *devInfo) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnSpotrs( - cusolverH, uplo, n, nrhs, Adata, lda, Bdata, ldb, devInfo)); -} - -template <> -void cusolver_potrs(const cusolverDnHandle_t &cusolverH, - cublasFillMode_t uplo, int n, int nrhs, - double *Adata, int lda, double *Bdata, int ldb, - int *devInfo) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnDpotrs( - cusolverH, uplo, n, nrhs, Adata, lda, Bdata, ldb, devInfo)); -} - -template <> -void cusolver_potrs>( - const cusolverDnHandle_t &cusolverH, cublasFillMode_t uplo, int n, int nrhs, - platform::complex *Adata, int lda, platform::complex *Bdata, - int ldb, int *devInfo) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnCpotrs( - cusolverH, uplo, n, nrhs, reinterpret_cast(Adata), lda, - reinterpret_cast(Bdata), ldb, devInfo)); -} - -template <> -void cusolver_potrs>( - const cusolverDnHandle_t &cusolverH, cublasFillMode_t uplo, int n, int nrhs, - platform::complex *Adata, int lda, platform::complex *Bdata, - int ldb, int *devInfo) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnZpotrs( - cusolverH, uplo, n, nrhs, - reinterpret_cast(Adata), lda, - reinterpret_cast(Bdata), ldb, devInfo)); -} - -template -class CholeskySolveFunctor { - public: - void operator()(const platform::CUDADeviceContext &dev_ctx, bool upper, int n, - int nrhs, T *Adata, int lda, T *Bdata, int *devInfo) { - cublasFillMode_t uplo = - upper ? 
CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER; - - /* step 1: get cusolver handle*/ - auto cusolverH = dev_ctx.cusolver_dn_handle(); - - /* step 2: solve A0*X0 = B0 */ - cusolver_potrs(cusolverH, uplo, n, nrhs, Adata, lda, Bdata, lda, - devInfo); - - PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); - } -}; - -template -class MatrixReduceSumFunctor { - public: - void operator()(const Tensor &in, Tensor *out, - const framework::ExecutionContext &ctx) { - // For example: in's dim = [5, 3, 2, 7, 3] ; out's dim = [3, 1, 7, 3] - // out_reduce_dim should be [0, 2] - const std::vector in_dims = phi::vectorize(in.dims()); - auto in_size = in_dims.size(); - const std::vector out_dims = phi::vectorize(out->dims()); - auto out_size = out_dims.size(); - - std::vector out_bst_dims(in_size); - - std::fill(out_bst_dims.data(), out_bst_dims.data() + in_size - out_size, 1); - std::copy(out_dims.data(), out_dims.data() + out_size, - out_bst_dims.data() + in_size - out_size); - - std::vector out_reduce_dims; - for (size_t idx = 0; idx <= in_size - 3; idx++) { - if (in_dims[idx] != 1 && out_bst_dims[idx] == 1) { - out_reduce_dims.push_back(idx); - } - } - gpuStream_t stream = ctx.cuda_device_context().stream(); - TensorReduceImpl>( - ctx.cuda_device_context(), in, out, kps::IdentityFunctor(), - out_reduce_dims, stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - cholesky_solve, - ops::CholeskySolveKernel, - ops::CholeskySolveKernel); - -REGISTER_OP_CUDA_KERNEL( - cholesky_solve_grad, - ops::CholeskySolveGradKernel, - ops::CholeskySolveGradKernel); - -#endif // not PADDLE_WITH_HIP diff --git a/paddle/fluid/operators/cholesky_solve_op.h b/paddle/fluid/operators/cholesky_solve_op.h deleted file mode 100644 index f25fbbb0c698036951c4b9ae8e9ad2778786a1a2..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/cholesky_solve_op.h +++ /dev/null @@ -1,248 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/solve_op.h" -#include "paddle/fluid/operators/svd_helper.h" -#include "paddle/fluid/operators/triangular_solve_op.h" -#include "paddle/fluid/platform/complex.h" -#include "paddle/phi/kernels/funcs/lapack/lapack_function.h" -#include "paddle/phi/kernels/math_kernel.h" - -namespace paddle { -namespace operators { // namespace operators - -template -class CholeskySolveFunctor { - public: - void operator()(const platform::DeviceContext &dev_ctx, bool upper, int n, - int nrhs, T *Adata, int lda, T *Bdata, int *devInfo); -}; - -template -class CholeskySolveFunctor { - public: - void operator()(const platform::CPUDeviceContext &dev_ctx, bool upper, int n, - int nrhs, T *Adata, int lda, T *Bdata, int *devInfo) { - char uplo = upper ? 
'U' : 'L'; - phi::funcs::lapackCholeskySolve(uplo, n, nrhs, Adata, lda, Bdata, lda, - devInfo); - } -}; - -template -void cholesky_solve_fn(const paddle::framework::ExecutionContext &ctx, - const framework::Tensor &uin, - const framework::Tensor &bin, framework::Tensor *out, - bool upper) { - const auto &dev_ctx = ctx.template device_context(); - // framework::Tensor broadcast - std::vector u_bst_dims_vec; - std::vector b_bst_dims_vec; - std::tie(u_bst_dims_vec, b_bst_dims_vec) = get_broadcast_dims(uin, bin); - framework::Tensor u_bst(uin.type()); - TensorExpand(dev_ctx, uin, &u_bst, u_bst_dims_vec); - - framework::Tensor b_bst(bin.type()); - TensorExpand(dev_ctx, bin, &b_bst, b_bst_dims_vec); - - math::DeviceIndependenceTensorOperations helper(ctx); - - // calculate u's conjugate for complex - framework::Tensor u_conj(u_bst.type()); - platform::ForRange u_for_range(dev_ctx, u_bst.numel()); - phi::funcs::ConjFunctor u_functor( - u_bst.data(), u_bst.numel(), - u_conj.mutable_data(u_bst.dims(), dev_ctx.GetPlace())); - u_for_range(u_functor); - u_conj = helper.Transpose(u_conj); - - // calculate b's conjugate for complex - framework::Tensor b_conj(b_bst.type()); - platform::ForRange b_for_range(dev_ctx, b_bst.numel()); - phi::funcs::ConjFunctor b_functor( - b_bst.data(), b_bst.numel(), - b_conj.mutable_data(b_bst.dims(), dev_ctx.GetPlace())); - b_for_range(b_functor); - b_conj = helper.Transpose(b_conj); - - auto ut_data = u_conj.mutable_data(dev_ctx.GetPlace()); - auto uindims = u_bst.dims(); - auto bindims = b_bst.dims(); - int uinrank = uindims.size(); - int binrank = bindims.size(); - - int n = uindims[uinrank - 2]; - int nrhs = bindims[binrank - 1]; - int ldab = std::max(1, n); - - // framework::Tensor out_copy(b_conj.type()); - // out_copy.Resize(b_conj.dims()); - framework::TensorCopy(b_conj, dev_ctx.GetPlace(), out); - T *out_data = out->mutable_data(dev_ctx.GetPlace()); - - auto info_dims = phi::slice_ddim(bindims, 0, binrank - 2); - auto batchsize = product(info_dims); - - framework::Tensor tmp; - std::vector tmpdim(1, batchsize); - tmp.Resize(phi::make_ddim(tmpdim)); - int *info = tmp.mutable_data(dev_ctx.GetPlace()); - - CholeskySolveFunctor functor; - for (int b = 0; b < batchsize; b++) { - auto uin_data_item = &ut_data[b * n * n]; - auto out_data_item = &out_data[b * n * nrhs]; - auto info_item = &info[b]; - functor(dev_ctx, upper, n, nrhs, uin_data_item, ldab, out_data_item, - info_item); - } - - // calculate out's conjugate for complex - platform::ForRange out_for_range(dev_ctx, out->numel()); - phi::funcs::ConjFunctor out_functor( - out->data(), out->numel(), - out->mutable_data(out->dims(), dev_ctx.GetPlace())); - out_for_range(out_functor); - *out = helper.Transpose(*out); -} - -template -class CholeskySolveKernel : public framework::OpKernel { - public: - void Compute(const paddle::framework::ExecutionContext &ctx) const override { - auto *uin = ctx.Input("Y"); - auto *bin = ctx.Input("X"); - auto *out = ctx.Output("Out"); - auto upper = ctx.Attr("upper"); - cholesky_solve_fn(ctx, *uin, *bin, out, upper); - } -}; - -template -class CholeskySolveGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *bin = ctx.Input("X"); - auto *uin = ctx.Input("Y"); - auto *out = ctx.Input("Out"); - auto *dout = ctx.Input(framework::GradVarName("Out")); - auto *db = ctx.Output(framework::GradVarName("X")); - auto *du = ctx.Output(framework::GradVarName("Y")); - auto upper = ctx.Attr("upper"); - - const auto 
&dev_ctx = ctx.template device_context(); - math::DeviceIndependenceTensorOperations helper(ctx); - - std::vector u_bst_dims_vec; - std::vector b_bst_dims_vec; - std::tie(u_bst_dims_vec, b_bst_dims_vec) = get_broadcast_dims(*uin, *bin); - framework::Tensor u_bst(uin->type()); - TensorExpand(dev_ctx, *uin, &u_bst, u_bst_dims_vec); - - framework::Tensor db_bst(bin->type()); - TensorExpand(dev_ctx, *bin, &db_bst, b_bst_dims_vec); - - if (dout) { - db->mutable_data(dev_ctx.GetPlace()); - cholesky_solve_fn(ctx, u_bst, *dout, &db_bst, upper); - - if (db_bst.dims() == db->dims()) { - framework::TensorCopy(db_bst, dev_ctx.GetPlace(), dev_ctx, db); - } else { - MatrixReduceSumFunctor functor; - functor(db_bst, db, ctx); - db->Resize(bin->dims()); - } - - auto blas = phi::funcs::GetBlas(ctx); - - // calculate out's conjugate for complex - framework::Tensor out_conj(out->type()); - platform::ForRange out_for_range(dev_ctx, out->numel()); - phi::funcs::ConjFunctor out_functor( - out->data(), out->numel(), - out_conj.mutable_data(out->dims(), dev_ctx.GetPlace())); - out_for_range(out_functor); - out_conj = helper.Transpose(out_conj); - - framework::Tensor commonterm(out->type()); - auto outdims = out_conj.dims(); - auto dbdims = db_bst.dims(); - auto mat_dim_a = phi::funcs::CreateMatrixDescriptor(outdims, 0, false); - auto mat_dim_b = phi::funcs::CreateMatrixDescriptor(dbdims, 0, false); - auto cmtdim = outdims; - cmtdim[cmtdim.size() - 2] = dbdims[dbdims.size() - 2]; - commonterm.Resize(cmtdim); - commonterm.mutable_data(dev_ctx.GetPlace()); - blas.MatMul(db_bst, mat_dim_b, out_conj, mat_dim_a, static_cast(1), - &commonterm, static_cast(0)); - - // calculate commonterm's conjugate for complex - framework::Tensor commonterm_conj(commonterm.type()); - platform::ForRange commonterm_for_range( - dev_ctx, commonterm.numel()); - phi::funcs::ConjFunctor commonterm_functor( - commonterm.data(), commonterm.numel(), - commonterm_conj.mutable_data(commonterm.dims(), - dev_ctx.GetPlace())); - commonterm_for_range(commonterm_functor); - commonterm_conj = helper.Transpose(commonterm_conj); - - phi::AddRawKernel( - static_cast::TYPE &>(dev_ctx), - commonterm, commonterm_conj, -1, &commonterm); - - auto mat_dim_u = - phi::funcs::CreateMatrixDescriptor(u_bst.dims(), 0, false); - auto mat_dim_c = - phi::funcs::CreateMatrixDescriptor(commonterm.dims(), 0, false); - - Tensor du_bst(uin->type()); - // get upper or lower triangular - du_bst.Resize(u_bst.dims()); - du_bst.mutable_data(dev_ctx.GetPlace()); - if (upper) { - blas.MatMul(u_bst, mat_dim_u, commonterm, mat_dim_c, static_cast(-1), - &du_bst, static_cast(0)); - } else { - blas.MatMul(commonterm, mat_dim_c, u_bst, mat_dim_u, static_cast(-1), - &du_bst, static_cast(0)); - } - - const auto &udims = u_bst.dims(); - const auto H = udims[udims.size() - 2]; - const auto W = udims[udims.size() - 1]; - platform::ForRange x_for_range(dev_ctx, u_bst.numel()); - TrilTriuCompute tril_triu_computer(du_bst.data(), 0, !upper, H, W, - u_bst.data()); - x_for_range(tril_triu_computer); - - du->mutable_data(dev_ctx.GetPlace()); - if (u_bst.dims() == du->dims()) { - framework::TensorCopy(u_bst, dev_ctx.GetPlace(), dev_ctx, du); - } else { - MatrixReduceSumFunctor functor; - functor(u_bst, du, ctx); - du->Resize(uin->dims()); - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/cinn/CMakeLists.txt b/paddle/fluid/operators/cinn/CMakeLists.txt index f1247ebdf23c8e00cdbfd662a160912a769d7558..2092f65212a6a71534e1ea9a6977abc94bf97b6a 100644 
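The deleted MatrixReduceSumFunctor above sums a broadcast gradient back to its original shape; its in-code example (in dims [5, 3, 2, 7, 3], out dims [3, 1, 7, 3], reduce over axes [0, 2]) is worth spelling out. A standalone sketch of the axis selection, with element types assumed since the extraction lost them:

#include <algorithm>
#include <cstdint>
#include <vector>

// Pick the batch axes that were broadcast and therefore must be summed away.
std::vector<int> GetReduceDims(const std::vector<int64_t>& in_dims,
                               const std::vector<int64_t>& out_dims) {
  const size_t in_size = in_dims.size();
  const size_t out_size = out_dims.size();
  // Right-align out_dims against in_dims by padding leading 1s.
  std::vector<int64_t> out_bst(in_size, 1);
  std::copy(out_dims.begin(), out_dims.end(),
            out_bst.begin() + (in_size - out_size));
  std::vector<int> reduce_dims;
  // Only batch dimensions are scanned; the trailing two matrix dimensions
  // stay, matching the original loop bound (idx <= in_size - 3).
  for (size_t i = 0; i + 2 < in_size; ++i) {
    if (in_dims[i] != 1 && out_bst[i] == 1) {
      reduce_dims.push_back(static_cast<int>(i));
    }
  }
  return reduce_dims;
}

// E.g. GetReduceDims({5, 3, 2, 7, 3}, {3, 1, 7, 3}) returns {0, 2}.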
--- a/paddle/fluid/operators/cinn/CMakeLists.txt +++ b/paddle/fluid/operators/cinn/CMakeLists.txt @@ -1,9 +1,9 @@ include(operators) cc_library(cinn_op_helper SRCS cinn_op_helper.cc DEPS operator device_context) -cc_library(cinn_launch_context SRCS cinn_launch_context.cc DEPS ddim lod_tensor scope proto_desc graph build_strategy parallel_executor cinn) +cc_library(cinn_launch_context SRCS cinn_launch_context.cc DEPS ddim lod_tensor scope proto_desc graph build_strategy device_context parallel_executor cinn) -SET(CINN_OP_DEPS string_helper cinn cinn_compiler cinn_op_helper cinn_launch_context) +SET(CINN_OP_DEPS parallel_executor string_helper cinn cinn_compiler cinn_op_helper cinn_launch_context) register_operators(DEPS ${CINN_OP_DEPS}) if (WITH_TESTING) @@ -11,7 +11,7 @@ if (WITH_TESTING) set_tests_properties(cinn_launch_context_test PROPERTIES LABELS "RUN_TYPE=CINN") SET(CINN_RUN_ENVIRONMENT "OMP_NUM_THREADS=1;runtime_include_dir=${PADDLE_BINARY_DIR}/third_party/CINN/src/external_cinn/cinn/runtime/cuda") - cc_test(cinn_launch_op_test SRCS cinn_launch_op_test.cc DEPS cinn_compiler cinn_launch_op elementwise_add_op) + cc_test(cinn_launch_op_test SRCS cinn_launch_op_test.cc DEPS cinn_compiler cinn_launch_op cinn_instruction_run_op elementwise_add_op gflags) set_tests_properties(cinn_launch_op_test PROPERTIES LABELS "RUN_TYPE=CINN" ENVIRONMENT "${CINN_RUN_ENVIRONMENT}") cc_test(cinn_instruction_run_op_test SRCS cinn_instruction_run_op_test.cc DEPS cinn_compiler cinn_launch_op cinn_instruction_run_op elementwise_add_op) diff --git a/paddle/fluid/operators/cinn/cinn_instruction_run_op_test.cc b/paddle/fluid/operators/cinn/cinn_instruction_run_op_test.cc index 7c4bdc09a569e455b20febef278003ada923dd79..0edbee534c0b5d680717250e7702f272eacd0272 100644 --- a/paddle/fluid/operators/cinn/cinn_instruction_run_op_test.cc +++ b/paddle/fluid/operators/cinn/cinn_instruction_run_op_test.cc @@ -22,11 +22,17 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/init.h" +#include "paddle/phi/core/kernel_registry.h" USE_OP(cinn_launch); USE_OP(cinn_instruction_run); USE_OP_ITSELF(elementwise_add); +PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); +#ifdef PADDLE_WITH_CUDA +PD_DECLARE_KERNEL(add, GPU, ALL_LAYOUT); +#endif + namespace paddle::operators { using framework::paddle2cinn::CinnCompiler; @@ -50,7 +56,7 @@ TEST(CinnInstructionOpTest, TestWithElementwiseAdd) { auto cinn_instruction_run_op = paddle::framework::OpRegistry::CreateOp( "cinn_instruction_run", {{"X", {"x", "y"}}}, {{"Out", {test_op_out_name}}}, - {{"cached_index", 0}, {"instruction_index", 1}}); + {{"cached_index", 0}, {"instruction_index", 0}}); auto elementwise_add_op = paddle::framework::OpRegistry::CreateOp( "elementwise_add", {{"X", {"x"}}, {"Y", {"y"}}}, {{"Out", {add_op_out_name}}}, {{}}); diff --git a/paddle/fluid/operators/cinn/cinn_launch_context.cc b/paddle/fluid/operators/cinn/cinn_launch_context.cc index 0a21d937aa1a70120e6112cdb291aa41eb222bb3..b76dd60409221eef9204f26319dabb20db4a36ac 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_context.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_context.cc @@ -31,6 +31,7 @@ #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/operators/cinn/cinn_op_helper.h" +#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/core/ddim.h" @@ -90,9 +91,30 @@ CinnLaunchContext::CinnLaunchContext(const framework::ir::Graph& graph, // Convert the CINN runtime program to a Paddle graph runtime_graph_ = std::make_unique( BuildCompiledProgram(graph, compiled_obj)); - runtime_graph_->SetNotOwned( - kMemOptVarInfoFromMainGraph, - &graph.Get(kMemOptVarInfoFromMainGraph)); + auto& outer_varinfo = graph.Get(kMemOptVarInfoFromMainGraph); + runtime_graph_->SetNotOwned(kMemOptVarInfoFromMainGraph, + &outer_varinfo); + // collect skip_eager_vars + skip_eager_vars_.reserve(input_var_names.size() + output_var_names.size()); + auto add_skip_var_fn = [&outer_varinfo, this](const std::string& var_name) { + // if a var exists at outer_varinfo map, + // that means it can be erased after graph execution + if (!outer_varinfo.count(var_name)) { + skip_eager_vars_.emplace_back(var_name); + } + }; + std::for_each(input_var_names.begin(), input_var_names.end(), + add_skip_var_fn); + std::for_each(output_var_names.begin(), output_var_names.end(), + add_skip_var_fn); + VLOG(4) << string::Sprintf( + "Distribution of variables in the graph compiled:" + "input[%lu],internal[%lu],output[%lu]," + "outer_eager_deletion[%lu],skip_eager_deletion[%lu]," + "initialized_beforehand[%lu]", + input_var_names.size(), internal_var_names_.size(), + output_var_names.size(), outer_varinfo.size(), skip_eager_vars_.size(), + initialized_beforehand_vars_.size()); } void CinnLaunchContext::BuildVarNameMap( @@ -288,6 +310,7 @@ framework::ProgramDesc CinnLaunchContext::BuildCompiledProgram( // are set by values of the corresponding compiled tensors, // including the in/out variables where the equiality between their tensors // and the CINN compiled ones is verified in corresponding cinn_launch_op. 
+ std::unordered_set has_refer_vars; for (auto&& arg : cinn_argument_names_) { const std::string& var_name = cinn2paddle_varmap_.at(arg); framework::VarDesc* var_desc = block->Var(var_name); @@ -298,6 +321,7 @@ framework::ProgramDesc CinnLaunchContext::BuildCompiledProgram( auto* ori_desc = res->second; var_desc->SetPersistable(ori_desc->Persistable()); var_desc->SetIsParameter(ori_desc->IsParameter()); + has_refer_vars.insert(var_name); } auto cinn_tensor = GetCinnTensorOfVar(var_name); @@ -331,6 +355,12 @@ framework::ProgramDesc CinnLaunchContext::BuildCompiledProgram( auto* ins = instructions.at(ins_idx).get(); auto in_args = trans_and_pack_args_fn(ins->GetInArgs()); auto out_args = trans_and_pack_args_fn(ins->GetOutArgs()); + for (auto&& var_name : in_args) { + if (!has_refer_vars.count(var_name)) { + initialized_beforehand_vars_.emplace_back(var_name); + } + } + has_refer_vars.insert(out_args.begin(), out_args.end()); auto* op_desc = block->AppendOp(); op_desc->SetType("cinn_instruction_run"); @@ -348,16 +378,26 @@ ParallelExecutor* CinnLaunchContext::InitializePE(const platform::Place& place, framework::Scope* scope) { if (!parallel_executor_) { framework::details::ExecutionStrategy exec_strategy; + exec_strategy.num_threads_ = 1; + exec_strategy.use_device_ = platform::Place2DeviceType(place); framework::details::BuildStrategy build_strategy; parallel_executor_ = std::make_unique( place, scope, exec_strategy, build_strategy, runtime_graph_.get()); } // update the scope bound to an OpHandle and rebuild temporary variables + VLOG(4) << "Reset scope and initialize temporary variables"; std::unordered_map scope_map = { {parallel_executor_->GetLocalScopes().front(), scope}}; parallel_executor_->ResetOpHandleScopeMapOfGraphs(scope_map); parallel_executor_->PrepareVariables(scope); + for (auto&& var_name : initialized_beforehand_vars_) { + auto* var = scope->GetVar(var_name); + auto* buffer = GetCinnBufferOfVar(var_name); + auto dim = framework::DDim(buffer->dims, buffer->dimensions); + var->GetMutable()->Resize(dim); + var->GetMutable()->mutable_data(place); + } return parallel_executor_.get(); } diff --git a/paddle/fluid/operators/cinn/cinn_launch_context.h b/paddle/fluid/operators/cinn/cinn_launch_context.h index a4d613ea618a886d99344a34ad80aa02e88c10e7..ed5e4383d83d23322860e3f554160013fd5532c9 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_context.h +++ b/paddle/fluid/operators/cinn/cinn_launch_context.h @@ -86,6 +86,11 @@ class CinnLaunchContext { void CheckTensorEquivalent(const std::string& var_name, const framework::LoDTensor& paddle_tensor); + // Return the name list of variables skipped eager deletion + const std::vector& GetSkipEagerVars() const { + return skip_eager_vars_; + } + // Return internal variable names list const std::unordered_set& GetInternalVarNames() const { return internal_var_names_; @@ -143,6 +148,9 @@ class CinnLaunchContext { std::unordered_set internal_var_names_; // the names of the cinn arguments used in compiled executable program std::unordered_set cinn_argument_names_; + // TODO(CtfGo): remove this list after fixing batch_norm bug + // due to duplicate association in the same variable. 
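The initialized_beforehand_vars_ list introduced above records instruction inputs that no earlier instruction produces; InitializePE then sizes and allocates their tensors from the matching cinn_buffer_t before the run. The extracted hunk lost the template arguments of GetMutable and mutable_data; a reconstruction of that loop with framework::LoDTensor and float assumed for them:

// Reconstruction of the pre-initialization loop in InitializePE; the
// GetMutable<...> and mutable_data<...> arguments are assumptions, since the
// extraction stripped them.
for (auto&& var_name : initialized_beforehand_vars_) {
  auto* var = scope->GetVar(var_name);
  auto* buffer = GetCinnBufferOfVar(var_name);
  auto dim = framework::DDim(buffer->dims, buffer->dimensions);
  var->GetMutable<framework::LoDTensor>()->Resize(dim);
  var->GetMutable<framework::LoDTensor>()->mutable_data<float>(place);
}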
+ std::vector initialized_beforehand_vars_; // the variable scope compiled from cinn const std::shared_ptr cinn_scope_; @@ -150,6 +158,8 @@ class CinnLaunchContext { std::unique_ptr runtime_graph_; // a ParallelExecutor to execute the runtime graph std::unique_ptr parallel_executor_; + // the name list of skip_eager_vars in runtime + std::vector skip_eager_vars_; // because a cinn_pod_value_t does not own a cinn_buffer_t object, // an extra stroage is necessary to keep those objects and they can diff --git a/paddle/fluid/operators/cinn/cinn_launch_op.h b/paddle/fluid/operators/cinn/cinn_launch_op.h index cf3b98c6679b80acad8da69c91addadb9f66ce44..5263aae03ed3f1ab6afa4eb9e6bd38f61858b397 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_op.h +++ b/paddle/fluid/operators/cinn/cinn_launch_op.h @@ -103,8 +103,8 @@ class CinnLaunchOpKernel : public framework::OpKernel { details::DebugCinnCompiledResult(cinn_compiled_object); auto* launch_context = cinn_compiled_object.launch_context.get(); - // Step 3. Prepare arguments needed for the compiled executable program. - launch_context->UpdateCapturedEnv(scope, place); + // Step 3. check the computational consistency of the subgraph + // before and after the compilation // 3.1 Input variables: tensors of input variables have // been initialized before graph compiled, just check the // equiality between tensors of paddle and cinn. @@ -120,20 +120,15 @@ class CinnLaunchOpKernel : public framework::OpKernel { *inputs_name2tensor.at(var_name)); } - // 3.2 Output variables: the output variables will be initialized - // and allocated buffer in callbacks which are defined in the - // external_malloc/free interface of cinn_buffer_t - // in their corresponding arguments. - // 3.3 Internal variables: A temporary scope is created in - // UpdateCapturedEnv to keep the internal variables and - // they are also initialized through callbacks - // Step 4. Set CINN runtime FLAGS, such as FLAGS_cinn_cudnn_deterministic. details::SetCinnRuntimeFlags(); - // Step 5. Launch CINN to execute the compiled executable program - VLOG(4) << "Run Cinn compiled executable program with stream: " << stream; - details::LaunchCinnExecution(cinn_compiled_object, *launch_context, stream); + // Step 5. use PE to execute the compiled CINN instructions + // in nodes of the runtime graph + VLOG(4) << "Execute the runtime graph by PE"; + framework::Scope& exec_scope = scope.NewScope(); + auto* pe = launch_context->InitializePE(place, &exec_scope); + pe->RunWithoutFetch(launch_context->GetSkipEagerVars()); VLOG(4) << "CinnLaunchOp launch execution done."; } }; diff --git a/paddle/fluid/operators/cinn/cinn_launch_op_test.cc b/paddle/fluid/operators/cinn/cinn_launch_op_test.cc index f5b6161ff3462cc1f12df7f59b4709bf19032df2..585f1caabed051134fd5ce7624c17b741b487ef0 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_op_test.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_op_test.cc @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include +#include "gflags/gflags.h" #include "gtest/gtest.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/paddle2cinn/cinn_compiler.h" @@ -25,9 +26,17 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/platform/init.h" #include "paddle/phi/core/ddim.h" +#include "paddle/phi/core/kernel_registry.h" USE_OP(cinn_launch); +USE_OP(cinn_instruction_run); USE_OP_ITSELF(elementwise_add); +DECLARE_double(eager_delete_tensor_gb); + +PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); +#ifdef PADDLE_WITH_CUDA +PD_DECLARE_KERNEL(add, GPU, ALL_LAYOUT); +#endif namespace paddle::operators { @@ -61,6 +70,7 @@ TEST(CinnLaunchOpTest, TestWithElementwiseAdd) { CompareOpResult(scope.GetVar(test_op_out_name), scope.GetVar(add_op_out_name)); }; + FLAGS_eager_delete_tensor_gb = -1; // CPU run_and_check_fn(platform::CPUPlace()); diff --git a/paddle/fluid/operators/collective/c_allgather_op_mlu.cc b/paddle/fluid/operators/collective/c_allgather_op_mlu.cc new file mode 100644 index 0000000000000000000000000000000000000000..f29bc57c9a5f4dbbfd53220ce187b386b3025e55 --- /dev/null +++ b/paddle/fluid/operators/collective/c_allgather_op_mlu.cc @@ -0,0 +1,81 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_allgather_op.h" + +#if defined(PADDLE_WITH_CNCL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/device/mlu/cncl_helper.h" +#endif +#include "paddle/fluid/framework/convert_utils.h" + +namespace paddle { +namespace operators { + +template +class CAllGatherOpMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_CNCL) + auto x = ctx.Input("X"); + auto out = ctx.Output("Out"); + cnclDataType_t dtype = + platform::ToCNCLDataType(framework::TransToProtoVarType(x->dtype())); + + int nranks = ctx.Attr("nranks"); + int rid = ctx.Attr("ring_id"); + auto place = ctx.GetPlace(); + auto comm = platform::CNCLCommContext::Instance().Get(rid, place); + PADDLE_ENFORCE_EQ( + nranks, comm->nranks(), + platform::errors::InvalidArgument("nranks: %s should equal to %s", + nranks, comm->nranks())); + + framework::DDim out_dims = x->dims(); + out_dims[0] *= nranks; + out->mutable_data(out_dims, place); + + uint32_t send_numel = x->numel(); + void* send_buff = reinterpret_cast(const_cast(x->data())); + void* recv_buff = reinterpret_cast(out->data()); + + mluStream stream = nullptr; + if (ctx.Attr("use_calc_stream")) { + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + stream = static_cast(dev_ctx)->stream(); + } else { + stream = comm->stream(); + } + + PADDLE_ENFORCE_MLU_SUCCESS(cnclAllGather(send_buff, recv_buff, send_numel, + dtype, comm->comm(), stream)); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with MLU.")); +#endif + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_MLU_KERNEL(c_allgather, ops::CAllGatherOpMLUKernel, + ops::CAllGatherOpMLUKernel, + ops::CAllGatherOpMLUKernel, + 
ops::CAllGatherOpMLUKernel, + ops::CAllGatherOpMLUKernel, + ops::CAllGatherOpMLUKernel); diff --git a/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc b/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc index c0968581acda9950aaa8ee2b8f3af15e1db59a67..7206dd01bcaa3e588cc275c2fdf25e70aacc1663 100644 --- a/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc @@ -26,7 +26,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc b/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc index 31b00a93f1396564907a7872e919ba6c96f666d8..0946ad8aca65e28835ea1d139fb94c309ce840a1 100644 --- a/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc @@ -26,7 +26,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/collective/c_allreduce_op.h b/paddle/fluid/operators/collective/c_allreduce_op.h index 7e5120cd2b392b1eb0698727ccebac485193f6d9..2c4e85400ca4adadce5db1fd318ce2273caa201f 100644 --- a/paddle/fluid/operators/collective/c_allreduce_op.h +++ b/paddle/fluid/operators/collective/c_allreduce_op.h @@ -413,7 +413,7 @@ class CAllReduceOpMLUKernel : public framework::OpKernel { auto place = ctx.GetPlace(); cnclDataType_t dtype = - platform::ToCNCLDataType(framework::TransToProtoVarType(in->type())); + platform::ToCNCLDataType(framework::TransToProtoVarType(in->dtype())); int64_t numel = in->numel(); const void* sendbuff = in->data(); out->Resize(in->dims()); diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc index 9c11704704ed420b14a6ccd9873e0bfbe143b4fe..61e5f27903477972ef10465ccfd6f8de8ce8fba6 100644 --- a/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc @@ -26,7 +26,6 @@ limitations under the License. 
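The test updates in this diff follow one adaptation pattern for the phi kernel migration: operators whose kernels left fluid are pulled in with USE_OP_ITSELF, the phi kernels they rely on are declared explicitly, stale includes of removed headers (dropout_op.h in the NPU tests here) are dropped, and the CINN launch test pins FLAGS_eager_delete_tensor_gb so buffers survive the run. A minimal sketch of the declaration block, taken from the cinn test changes above:

// What a test translation unit declares once elementwise_add's kernels live
// in phi rather than fluid (copied from the cinn tests in this diff).
USE_OP(cinn_launch);
USE_OP(cinn_instruction_run);
USE_OP_ITSELF(elementwise_add);

PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT);
#ifdef PADDLE_WITH_CUDA
PD_DECLARE_KERNEL(add, GPU, ALL_LAYOUT);
#endif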
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/collective/c_broadcast_op_mlu.cc b/paddle/fluid/operators/collective/c_broadcast_op_mlu.cc index d315f211709e4f76c2d5c685721961a91c2102fe..d1e269fb5a4fe9505acf7043bc7a2cea36823ffa 100644 --- a/paddle/fluid/operators/collective/c_broadcast_op_mlu.cc +++ b/paddle/fluid/operators/collective/c_broadcast_op_mlu.cc @@ -31,7 +31,7 @@ class CBroadcastOPMLUKernel : public framework::OpKernel { auto out = ctx.Output("Out"); int numel = x->numel(); cnclDataType_t dtype = - platform::ToCNCLDataType(framework::TransToProtoVarType(x->type())); + platform::ToCNCLDataType(framework::TransToProtoVarType(x->dtype())); int rid = ctx.Attr("ring_id"); auto place = ctx.GetPlace(); diff --git a/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc b/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc index 5787090e6a52f2f37bd504a904108cd1d24caf5f..cf4d6a28744b368212fe8bcb0924001aa53b5a4e 100644 --- a/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc @@ -26,7 +26,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc b/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc index c79b2f92b69a1e6cc5c6f1cf17fa402c671a1997..c4e410d04da5fb5e9b6bfe4d7d5c263084889f54 100644 --- a/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc @@ -26,7 +26,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc b/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc index d9a7a4abb08fc883b9b9210fcdefd56af127263a..8b498787c69db0f978acaa68ba63883270e11eb4 100644 --- a/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc @@ -26,7 +26,6 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc b/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc index b8abf458c1c6d395fef08238abaa114ff5dc6e9e..133085ad3f3b0ffd00dbf4d026687b0311116951 100644 --- a/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc @@ -26,7 +26,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/collective/checknumeric_npu_test.cc b/paddle/fluid/operators/collective/checknumeric_npu_test.cc index bb78971734bf05e94f7b0ebc1f1540b254f98067..36c6f4fadd0fcc9b06c61d5c45ce6829f2d3d977 100644 --- a/paddle/fluid/operators/collective/checknumeric_npu_test.cc +++ b/paddle/fluid/operators/collective/checknumeric_npu_test.cc @@ -27,7 +27,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc b/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc index 8f7b8c4a9040be3a2b4540c693c128e92c06a180..6e02d362156970cdee7257c7d00b70cef0519757 100644 --- a/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc +++ b/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc @@ -26,7 +26,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/collective/send_v2_op_npu_test.cc b/paddle/fluid/operators/collective/send_v2_op_npu_test.cc index c40b2c3e76a02ce6e5e754b2dc4280d6917145e7..57e3dd53cc7748fa0fb66e7e934a1c9cd764a15f 100644 --- a/paddle/fluid/operators/collective/send_v2_op_npu_test.cc +++ b/paddle/fluid/operators/collective/send_v2_op_npu_test.cc @@ -25,7 +25,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc index 55de4087f579460fa6080733f3e2f02bb082b015..059fafa3e7f4d4ff0dac7541038d62e03865529f 100644 --- a/paddle/fluid/operators/concat_op.cc +++ b/paddle/fluid/operators/concat_op.cc @@ -18,7 +18,9 @@ limitations under the License. 
*/ #include #include #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/infermeta/multiary.h" #include "paddle/phi/kernels/funcs/concat_funcs.h" #ifdef PADDLE_WITH_MKLDNN @@ -33,41 +35,6 @@ class ConcatOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInputs("X"), "Input", "X", "Concat"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Concat"); - - auto inputs_dims = ctx->GetInputsDim("X"); - - const size_t inputs_num = inputs_dims.size(); - PADDLE_ENFORCE_GT( - inputs_num, static_cast(0), - platform::errors::InvalidArgument( - "The number of input tensors in concat op should > 0. But " - "received inputs' length is 0.")); - if (inputs_num == 1) { - VLOG(3) << "Warning: concat op have only one input, may waste memory"; - } - - if (ctx->HasInput("AxisTensor")) { - auto out_dims = - phi::make_ddim(std::vector(inputs_dims[0].size(), -1)); - ctx->SetOutputDim("Out", out_dims); - ctx->ShareLoD("X", /*->*/ "Out"); - } else { - size_t axis = - ComputeAxis(static_cast(ctx->Attrs().Get("axis")), - static_cast(inputs_dims[0].size())); - framework::DDim out_dims = - phi::funcs::ComputeAndCheckShape(ctx->IsRuntime(), inputs_dims, axis); - if (out_dims[axis] < 0) { - out_dims[axis] = -1; - } - ctx->SetOutputDim("Out", out_dims); - ctx->ShareLoD("X", /*->*/ "Out"); - } - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -237,9 +204,14 @@ class ConcatDoubleGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; + +DECLARE_INFER_SHAPE_FUNCTOR(concat, ConcatInferShapeFunctor, + PD_INFER_META(phi::ConcatInferMeta)); + REGISTER_OPERATOR(concat, ops::ConcatOp, ops::ConcatOpMaker, ops::ConcatGradOpMaker, - ops::ConcatGradOpMaker); + ops::ConcatGradOpMaker, + ConcatInferShapeFunctor); REGISTER_OPERATOR(concat_grad, ops::ConcatOpGrad, ops::ConcatDoubleGradOpMaker, ops::ConcatDoubleGradOpMaker, diff --git a/paddle/fluid/operators/conj_op.cc b/paddle/fluid/operators/conj_op.cc index 95135ba3b1a3db156cd80629296481470b11f937..cbec1182f20b886fb4a77847abf7213aec9990a5 100644 --- a/paddle/fluid/operators/conj_op.cc +++ b/paddle/fluid/operators/conj_op.cc @@ -66,8 +66,8 @@ class ConjGradMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(conj, ConjInferShapeFunctor, - PT_INFER_META(phi::UnchangedInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(conj, ConjInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); REGISTER_OPERATOR(conj, ops::ConjOp, ops::ConjOpMaker, ops::ConjGradMaker, ops::ConjGradMaker, diff --git a/paddle/fluid/operators/controlflow/CMakeLists.txt b/paddle/fluid/operators/controlflow/CMakeLists.txt index 1a2df2a0c7ba34f67ecb7c2ade002fcb4475229f..0c18522fa32eae5f357da062fbd25fa92878cc08 100644 --- a/paddle/fluid/operators/controlflow/CMakeLists.txt +++ b/paddle/fluid/operators/controlflow/CMakeLists.txt @@ -19,6 +19,6 @@ else() target_link_libraries(conditional_block_infer_op conditional_block_op) endif() -file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(equal_all);\nUSE_NO_KERNEL_OP(read_from_array);\n") -file(APPEND ${pybind_file} "USE_OP(logical_and);\nUSE_OP(logical_or);\nUSE_OP(logical_xor);\nUSE_OP(logical_not);\n") -file(APPEND ${pybind_file} 
"USE_OP(bitwise_and);\nUSE_OP(bitwise_or);\nUSE_OP(bitwise_xor);\nUSE_OP(bitwise_not);\n") +file(APPEND ${pybind_file} "USE_OP_ITSELF(less_than);\nUSE_OP_ITSELF(equal_all);\nUSE_NO_KERNEL_OP(read_from_array);\n") +file(APPEND ${pybind_file} "USE_OP_ITSELF(logical_and);\nUSE_OP_ITSELF(logical_or);\nUSE_OP_ITSELF(logical_xor);\nUSE_OP_ITSELF(logical_not);\n") +file(APPEND ${pybind_file} "USE_OP_ITSELF(bitwise_and);\nUSE_OP_ITSELF(bitwise_or);\nUSE_OP_ITSELF(bitwise_xor);\nUSE_OP_ITSELF(bitwise_not);\n") diff --git a/paddle/fluid/operators/controlflow/bitwise_op.cc b/paddle/fluid/operators/controlflow/bitwise_op.cc index 55cab03ea9e3f18f36043848914ac11fac1027c9..4dcbbc8568ff18a1313171f8f66f276d77f019a1 100644 --- a/paddle/fluid/operators/controlflow/bitwise_op.cc +++ b/paddle/fluid/operators/controlflow/bitwise_op.cc @@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/controlflow/bitwise_op.h" #include #include #include #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" namespace paddle { namespace operators { @@ -75,11 +75,19 @@ It operates ``%s`` on Tensor ``X`` . } }; -class BitwiseOp : public framework::OperatorWithKernel { +template +class UnaryBitwiseOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; protected: + void InferShape(framework::InferShapeContext *context) const override { + OpComment comment; + OP_INOUT_CHECK(context->HasInput("X"), "Input", "X", comment.type); + context->SetOutputDim("Out", context->GetInputDim("X")); + context->ShareLoD("X", "Out"); + } + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { framework::OpKernelType kt = OperatorWithKernel::GetExpectedKernelType(ctx); @@ -90,23 +98,9 @@ class BitwiseOp : public framework::OperatorWithKernel { }; template -class UnaryBitwiseOp : public BitwiseOp { - public: - using BitwiseOp::BitwiseOp; - - protected: - void InferShape(framework::InferShapeContext *context) const override { - OpComment comment; - OP_INOUT_CHECK(context->HasInput("X"), "Input", "X", comment.type); - context->SetOutputDim("Out", context->GetInputDim("X")); - context->ShareLoD("X", "Out"); - } -}; - -template -class BinaryBitwiseOp : public BitwiseOp { +class BinaryBitwiseOp : public framework::OperatorWithKernel { public: - using BitwiseOp::BitwiseOp; + using framework::OperatorWithKernel::OperatorWithKernel; protected: void InferShape(framework::InferShapeContext *context) const override { @@ -130,6 +124,14 @@ class BinaryBitwiseOp : public BitwiseOp { } context->ShareLoD("X", "Out"); } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + framework::OpKernelType kt = OperatorWithKernel::GetExpectedKernelType(ctx); + // BitwiseOp kernel's device type is decided by input tensor place + kt.place_ = ctx.Input("X")->place(); + return kt; + } }; } // namespace operators @@ -167,8 +169,3 @@ REGISTER_BINARY_BITWISE_OP(bitwise_and, "Out = X \\& Y"); REGISTER_BINARY_BITWISE_OP(bitwise_or, "Out = X | Y"); REGISTER_BINARY_BITWISE_OP(bitwise_xor, "Out = X ^\\wedge Y"); REGISTER_UNARY_BITWISE_OP(bitwise_not, "Out = \\sim X"); - -REGISTER_BINARY_BITWISE_KERNEL(bitwise_and, CPU, ops::BitwiseAndFunctor); -REGISTER_BINARY_BITWISE_KERNEL(bitwise_or, CPU, 
ops::BitwiseOrFunctor); -REGISTER_BINARY_BITWISE_KERNEL(bitwise_xor, CPU, ops::BitwiseXorFunctor); -REGISTER_UNARY_BITWISE_KERNEL(bitwise_not, CPU, ops::BitwiseNotFunctor); diff --git a/paddle/fluid/operators/controlflow/bitwise_op.cu b/paddle/fluid/operators/controlflow/bitwise_op.cu deleted file mode 100644 index 5d98da2c027fb6ee681bbea3980f1dbf631d6431..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/controlflow/bitwise_op.cu +++ /dev/null @@ -1,74 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/controlflow/bitwise_op.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" - -namespace paddle { -namespace operators { - -template -class BinaryBitwiseOpKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - using T = typename Functor::ELEM_TYPE; - - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - auto functor = Functor(); - std::vector ins = {x, y}; - std::vector outs = {out}; - const auto& cuda_ctx = - ctx.template device_context(); - paddle::operators::LaunchElementwiseCudaKernel(cuda_ctx, ins, &outs, -1, - functor); - } -}; - -template -class UnaryBitwiseOpKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - using T = typename Functor::ELEM_TYPE; - - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - auto functor = Functor(); - std::vector ins = {x}; - std::vector outs = {out}; - const auto& cuda_ctx = - ctx.template device_context(); - paddle::operators::LaunchSameDimsElementwiseCudaKernel(cuda_ctx, ins, - &outs, functor); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = ::paddle::operators; -namespace plat = ::paddle::platform; - -REGISTER_BINARY_BITWISE_KERNEL(bitwise_and, CUDA, ops::BitwiseAndFunctor); -REGISTER_BINARY_BITWISE_KERNEL(bitwise_or, CUDA, ops::BitwiseOrFunctor); -REGISTER_BINARY_BITWISE_KERNEL(bitwise_xor, CUDA, ops::BitwiseXorFunctor); -REGISTER_UNARY_BITWISE_KERNEL(bitwise_not, CUDA, ops::BitwiseNotFunctor); diff --git a/paddle/fluid/operators/controlflow/bitwise_op.h b/paddle/fluid/operators/controlflow/bitwise_op.h deleted file mode 100644 index 9e652f92007479684fcf8ec5e539312d8d729107..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/controlflow/bitwise_op.h +++ /dev/null @@ -1,112 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" -#include "paddle/fluid/platform/transform.h" - -namespace paddle { -namespace operators { - -#define BITWISE_BINARY_FUNCTOR(func, expr, bool_expr) \ - template \ - struct Bitwise##func##Functor { \ - using ELEM_TYPE = T; \ - HOSTDEVICE T operator()(const T a, const T b) const { return a expr b; } \ - }; \ - \ - template <> \ - struct Bitwise##func##Functor { \ - using ELEM_TYPE = bool; \ - HOSTDEVICE bool operator()(const bool a, const bool b) const { \ - return a bool_expr b; \ - } \ - }; - -BITWISE_BINARY_FUNCTOR(And, &, &&) -BITWISE_BINARY_FUNCTOR(Or, |, ||) -BITWISE_BINARY_FUNCTOR(Xor, ^, !=) -#undef BITWISE_BINARY_FUNCTOR - -template -struct BitwiseNotFunctor { - using ELEM_TYPE = T; - HOSTDEVICE T operator()(const T a) const { return ~a; } -}; - -template <> -struct BitwiseNotFunctor { - using ELEM_TYPE = bool; - HOSTDEVICE bool operator()(const bool a) const { return !a; } -}; - -template -class BinaryBitwiseOpKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - using T = typename Functor::ELEM_TYPE; - auto func = Functor(); - auto* x = context.Input("X"); - auto* y = context.Input("Y"); - auto* out = context.Output("Out"); - ElementwiseComputeEx(context, x, y, -1, func, - out); - } -}; - -template -class UnaryBitwiseOpKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - using T = typename Functor::ELEM_TYPE; - auto func = Functor(); - auto* x = context.Input("X"); - auto* out = context.Output("Out"); - platform::Transform trans; - trans(context.template device_context(), x->data(), - x->data() + x->numel(), out->mutable_data(context.GetPlace()), - func); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = ::paddle::operators; -namespace plat = ::paddle::platform; - -#define REGISTER_BINARY_BITWISE_KERNEL(op_type, dev, functor) \ - REGISTER_OP_##dev##_KERNEL( \ - op_type, \ - ops::BinaryBitwiseOpKernel>, \ - ops::BinaryBitwiseOpKernel>, \ - ops::BinaryBitwiseOpKernel>, \ - ops::BinaryBitwiseOpKernel>, \ - ops::BinaryBitwiseOpKernel>, \ - ops::BinaryBitwiseOpKernel>); - -#define REGISTER_UNARY_BITWISE_KERNEL(op_type, dev, functor) \ - REGISTER_OP_##dev##_KERNEL( \ - op_type, \ - ops::UnaryBitwiseOpKernel>, \ - ops::UnaryBitwiseOpKernel>, \ - ops::UnaryBitwiseOpKernel>, \ - ops::UnaryBitwiseOpKernel>, \ - ops::UnaryBitwiseOpKernel>, \ - ops::UnaryBitwiseOpKernel>); diff --git a/paddle/fluid/operators/controlflow/compare_all_op.cc b/paddle/fluid/operators/controlflow/compare_all_op.cc index ede349f737d899e5f04cb5e35d1dbc0c0abc2403..dd407f4f6f3c51ef99cb09f08ef7fdca5b1e10bc 100644 --- a/paddle/fluid/operators/controlflow/compare_all_op.cc +++ b/paddle/fluid/operators/controlflow/compare_all_op.cc @@ -12,49 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
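The deleted bitwise_op.h above generated its element-wise functors through a macro; the extraction stripped the template headers. An expanded reconstruction of one binary functor plus the not-functor, with the presumed template syntax restored (bool specializes to the logical operators so the ops stay meaningful on boolean tensors; HOSTDEVICE is Paddle's host/device qualifier):

// Reconstruction of BitwiseAndFunctor / BitwiseNotFunctor from the deleted
// header; the template parameters are restored as an assumption.
template <typename T>
struct BitwiseAndFunctor {
  using ELEM_TYPE = T;
  HOSTDEVICE T operator()(const T a, const T b) const { return a & b; }
};

template <>
struct BitwiseAndFunctor<bool> {
  using ELEM_TYPE = bool;
  HOSTDEVICE bool operator()(const bool a, const bool b) const {
    return a && b;
  }
};

template <typename T>
struct BitwiseNotFunctor {
  using ELEM_TYPE = T;
  HOSTDEVICE T operator()(const T a) const { return ~a; }
};

template <>
struct BitwiseNotFunctor<bool> {
  using ELEM_TYPE = bool;
  HOSTDEVICE bool operator()(const bool a) const { return !a; }
};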
See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/controlflow/compare_all_op.h" -#include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { -template -class CompareReduceOpKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - using T = typename Functor::ELEM_TYPE; - using Tensor = framework::Tensor; - - auto* x = context.Input("X"); - auto* y = context.Input("Y"); - auto* z = context.Output("Out"); - Tensor tmp; - bool* z_data = z->mutable_data(context.GetPlace()); - - if (x->dims() != y->dims()) { - z_data[0] = false; - } else { - tmp.mutable_data(x->dims(), context.GetPlace()); - if (x->numel() == 1 && y->numel() == 1) { - bool* z_data = tmp.mutable_data(context.GetPlace()); - z_data[0] = Functor()(x->data()[0], y->data()[0]); - } else { - ElementwiseComputeEx( - context, x, y, 0, Functor(), &tmp); - } - auto ipt = framework::EigenVector::Flatten(tmp); - auto out = framework::EigenScalar::From(*z); - auto& place = - *context.template device_context() - .eigen_device(); - auto reduce_dim = Eigen::array({{0}}); - out.device(place) = ipt.all(reduce_dim); - } - } -}; - template class CompareReduceOpProtoMaker : public framework::OpProtoAndCheckerMaker { public: @@ -81,26 +46,6 @@ template class CompareReduceOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - void InferShape(framework::InferShapeContext* context) const override { - OpComment comment; - PADDLE_ENFORCE_EQ(context->HasInput("X"), true, - platform::errors::InvalidArgument( - "%s operator must have input X", comment.type)); - PADDLE_ENFORCE_EQ(context->HasInput("Y"), true, - platform::errors::InvalidArgument( - "%s operator must have input Y", comment.type)); - auto dim_x = context->GetInputDim("X"); - auto dim_y = context->GetInputDim("Y"); - PADDLE_ENFORCE_GE( - dim_x.size(), dim_y.size(), - platform::errors::InvalidArgument( - "The size of dim_y should not be greater than dim_x's.")); - - context->SetOutputDim("Out", {1}); - context->ShareLoD("X", "Out"); - } }; } // namespace operators @@ -113,25 +58,13 @@ class CompareReduceOp : public framework::OperatorWithKernel { }; \ char _##op_type##Comment::type[]{#op_type}; \ char _##op_type##Comment::equation[]{_equation}; \ + DECLARE_INFER_SHAPE_FUNCTOR(op_type, op_type##_InferShapeFunctor, \ + PD_INFER_META(phi::CompareAllInferMeta)); \ REGISTER_OPERATOR( \ op_type, ::paddle::operators::CompareReduceOp<_##op_type##Comment>, \ ::paddle::operators::CompareReduceOpProtoMaker<_##op_type##Comment>, \ ::paddle::framework::EmptyGradOpMaker, \ - ::paddle::framework::EmptyGradOpMaker); + ::paddle::framework::EmptyGradOpMaker, \ + op_type##_InferShapeFunctor); -#define REGISTER_COMPARE_REDUCE_CPU_KERNEL(op_type, functor) \ - REGISTER_OP_CPU_KERNEL( \ - op_type, ::paddle::operators::CompareReduceOpKernel< \ - ::paddle::platform::CPUDeviceContext, functor>, \ - ::paddle::operators::CompareReduceOpKernel< \ - ::paddle::platform::CPUDeviceContext, functor>, \ - ::paddle::operators::CompareReduceOpKernel< \ - ::paddle::platform::CPUDeviceContext, functor>, \ - ::paddle::operators::CompareReduceOpKernel< \ - ::paddle::platform::CPUDeviceContext, functor>, \ - 
::paddle::operators::CompareReduceOpKernel< \ - ::paddle::platform::CPUDeviceContext, functor>); REGISTER_COMPARE_REDUCE_OP(equal_all, "X == Y"); - -REGISTER_COMPARE_REDUCE_CPU_KERNEL(equal_all, - paddle::operators::EqualReduceFunctor); diff --git a/paddle/fluid/operators/controlflow/compare_all_op.cu b/paddle/fluid/operators/controlflow/compare_all_op.cu deleted file mode 100644 index d96dcebe51f97f1a3a954966aeb3663ff1f7a819..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/controlflow/compare_all_op.cu +++ /dev/null @@ -1,92 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include "paddle/fluid/operators/controlflow/compare_all_op.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" -#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" - -namespace paddle { -namespace operators { - -template -struct BitwiseAdd { - // Bitwise add operator, returns a + b - inline T initial() { return static_cast(true); } - - __host__ __device__ __forceinline__ T operator()(const T& a, - const T& b) const { - return a & b; - } -}; - -template -class CompareReduceOpKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - using T = typename Functor::ELEM_TYPE; - using Tensor = framework::Tensor; - - auto* x = context.Input("X"); - auto* y = context.Input("Y"); - auto* z = context.Output("Out"); - bool* z_data = z->mutable_data(context.GetPlace()); - Tensor tmp; - - if (x->dims() != y->dims()) { - thrust::device_ptr z_dev_ptr(z_data); - thrust::fill(z_dev_ptr, z_dev_ptr + 1, false); - return; - } else { - tmp.mutable_data(x->dims(), context.GetPlace()); - const auto& cuda_ctx = - context.template device_context(); - std::vector ins = {x, y}; - std::vector outs = {&tmp}; - paddle::operators::LaunchSameDimsElementwiseCudaKernel( - cuda_ctx, ins, &outs, Functor()); - - // Reduce by 'bitwise and' operator - std::vector reduce_dims; - reduce_dims.resize(tmp.dims().size()); - for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i; - auto stream = context.cuda_device_context().stream(); - TensorReduceImpl>( - context.cuda_device_context(), tmp, z, kps::IdentityFunctor(), - reduce_dims, stream); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -#define REGISTER_COMPARE_REDUCE_CUDA_KERNEL(op_type, functor) \ - REGISTER_OP_CUDA_KERNEL( \ - op_type, \ - ops::CompareReduceOpKernel>, \ - ops::CompareReduceOpKernel>, \ - ops::CompareReduceOpKernel>, \ - ops::CompareReduceOpKernel>, \ - ops::CompareReduceOpKernel>); - -REGISTER_COMPARE_REDUCE_CUDA_KERNEL(equal_all, EqualReduceFunctor) -#undef REGISTER_COMPARE_REDUCE_CUDA_KERNEL diff --git a/paddle/fluid/operators/controlflow/compare_all_op.h b/paddle/fluid/operators/controlflow/compare_all_op.h deleted file mode 100644 index 
78a7b76e3fd9d03f2381dfb13f90c191d1dca4f8..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/controlflow/compare_all_op.h +++ /dev/null @@ -1,43 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" -#include "paddle/fluid/platform/transform.h" - -namespace paddle { -namespace operators { - -template -struct EqualReduceFunctor { - using ELEM_TYPE = T; - HOSTDEVICE bool operator()(const T a, const T b) const { - if (std::is_floating_point::value) { - // This branch will be optimized while compiling if T is integer. It is - // safe to cast a and b to double. - return fabs(static_cast(a - b)) < 1e-8; - } else { - return (a == b); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/controlflow/compare_op.cc b/paddle/fluid/operators/controlflow/compare_op.cc index 657e74398bb24bb4c2a5514bbb1656126591ee4e..72d81d8c3fdf2827da9b8362cee80ecbb16e4484 100644 --- a/paddle/fluid/operators/controlflow/compare_op.cc +++ b/paddle/fluid/operators/controlflow/compare_op.cc @@ -12,14 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
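The deleted compare_all_op.h defined the per-element predicate behind equal_all; with the stripped template syntax restored as an assumption it reads roughly as below. Floating-point elements compare with an absolute tolerance of 1e-8, everything else with operator== (the branch is resolved at compile time for integral T):

// Reconstruction of the deleted EqualReduceFunctor (HOSTDEVICE is Paddle's
// host/device qualifier; the template parameters are assumed).
template <typename T>
struct EqualReduceFunctor {
  using ELEM_TYPE = T;
  HOSTDEVICE bool operator()(const T a, const T b) const {
    if (std::is_floating_point<T>::value) {
      // Widening to double is safe here; the branch folds away for integers.
      return fabs(static_cast<double>(a - b)) < 1e-8;
    } else {
      return a == b;
    }
  }
};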
*/ -#include "paddle/fluid/operators/controlflow/compare_op.h" -#include -#include -#include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/phi/common/place.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -60,31 +58,6 @@ class CompareOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; protected: - void InferShape(framework::InferShapeContext* context) const override { - OpComment comment; - OP_INOUT_CHECK(context->HasInput("X"), "Input", "X", comment.type); - OP_INOUT_CHECK(context->HasInput("Y"), "Input", "Y", comment.type); - auto dim_x = context->GetInputDim("X"); - auto dim_y = context->GetInputDim("Y"); - - if (context->GetInputDim("X") == context->GetInputDim("Y")) { - context->ShareDim("X", /*->*/ "Out"); - context->ShareLoD("X", /*->*/ "Out"); - } else { - int max_dim = std::max(dim_x.size(), dim_y.size()); - int axis = std::abs(dim_x.size() - dim_y.size()); - std::vector x_dims_array(max_dim); - std::vector y_dims_array(max_dim); - std::vector out_dims_array(max_dim); - GetBroadcastDimsArrays(dim_x, dim_y, x_dims_array.data(), - y_dims_array.data(), out_dims_array.data(), - max_dim, axis); - context->SetOutputDim("Out", phi::make_ddim(out_dims_array)); - // to do - context->ShareLoD("X", /*->*/ "Out"); - } - } - framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { framework::OpKernelType kt = OperatorWithKernel::GetExpectedKernelType(ctx); @@ -116,37 +89,31 @@ class CompareOp : public framework::OperatorWithKernel { "In order to force fill output variable to gpu memory.", \ false)); -#define REGISTER_COMPARE_OP(op_type, _equation) \ - struct _##op_type##Comment { \ - static char type[]; \ - static char equation[]; \ - }; \ - char _##op_type##Comment::type[]{#op_type}; \ - char _##op_type##Comment::equation[]{_equation}; \ - REGISTER_OPERATOR( \ - op_type, ::paddle::operators::CompareOp<_##op_type##Comment>, \ - ::paddle::operators::CompareOpProtoMaker<_##op_type##Comment>, \ - ::paddle::framework::EmptyGradOpMaker, \ - ::paddle::framework::EmptyGradOpMaker); \ +#define REGISTER_COMPARE_OP(op_type, _equation) \ + struct _##op_type##Comment { \ + static char type[]; \ + static char equation[]; \ + }; \ + char _##op_type##Comment::type[]{#op_type}; \ + char _##op_type##Comment::equation[]{_equation}; \ + DECLARE_INFER_SHAPE_FUNCTOR(op_type, op_type##_InferShapeFunctor, \ + PD_INFER_META(phi::CompareInferMeta)); \ + REGISTER_OPERATOR( \ + op_type, ::paddle::operators::CompareOp<_##op_type##Comment>, \ + ::paddle::operators::CompareOpProtoMaker<_##op_type##Comment>, \ + ::paddle::framework::EmptyGradOpMaker, \ + ::paddle::framework::EmptyGradOpMaker, \ + op_type##_InferShapeFunctor); \ REGISTER_COMPARE_OP_VERSION(op_type); REGISTER_COMPARE_OP(less_than, "Out = X < Y"); -REGISTER_COMPARE_KERNEL(less_than, CPU, paddle::operators::LessThanFunctor, - paddle::operators::GreaterThanFunctor); + REGISTER_COMPARE_OP(less_equal, "Out = X <= Y"); -REGISTER_COMPARE_KERNEL(less_equal, CPU, paddle::operators::LessEqualFunctor, - paddle::operators::GreaterEqualFunctor); + REGISTER_COMPARE_OP(greater_than, "Out = X > Y"); -REGISTER_COMPARE_KERNEL(greater_than, CPU, - paddle::operators::GreaterThanFunctor, - 
paddle::operators::LessThanFunctor); + REGISTER_COMPARE_OP(greater_equal, "Out = X >= Y"); -REGISTER_COMPARE_KERNEL(greater_equal, CPU, - paddle::operators::GreaterEqualFunctor, - paddle::operators::LessEqualFunctor); + REGISTER_COMPARE_OP(equal, "Out = X == Y"); -REGISTER_COMPARE_KERNEL(equal, CPU, paddle::operators::EqualFunctor, - paddle::operators::EqualFunctor); + REGISTER_COMPARE_OP(not_equal, "Out = X != Y"); -REGISTER_COMPARE_KERNEL(not_equal, CPU, paddle::operators::NotEqualFunctor, - paddle::operators::NotEqualFunctor); diff --git a/paddle/fluid/operators/controlflow/compare_op.cu b/paddle/fluid/operators/controlflow/compare_op.cu deleted file mode 100644 index 4b9452d0f60e0396e4bc50bb5ea56e2f3131098e..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/controlflow/compare_op.cu +++ /dev/null @@ -1,63 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/controlflow/compare_op.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -namespace paddle { -namespace operators { - -template -class CompareOpKernel - : public framework::OpKernel { - public: - using InT = typename Functor::ELEM_TYPE; - using OutT = bool; - void Compute(const framework::ExecutionContext& ctx) const override { - auto functor = Functor(); - std::vector ins; - std::vector outs; - const auto& cuda_ctx = - ctx.template device_context(); - - int axis = PackTensorsIntoVector(ctx, &ins, &outs); - paddle::operators::LaunchElementwiseCudaKernel( - cuda_ctx, ins, &outs, axis, functor); - } -}; - -} // namespace operators -} // namespace paddle - -#define REGISTER_CUDA_COMPARE_KERNEL(op_type, func) \ - REGISTER_OP_CUDA_KERNEL( \ - op_type, \ - ops::CompareOpKernel, void>, \ - ops::CompareOpKernel, void>, \ - ops::CompareOpKernel, void>, \ - ops::CompareOpKernel, void>, \ - ops::CompareOpKernel, void>, \ - ops::CompareOpKernel, void>); - -REGISTER_CUDA_COMPARE_KERNEL(equal, EqualFunctor) -REGISTER_CUDA_COMPARE_KERNEL(not_equal, NotEqualFunctor) -REGISTER_CUDA_COMPARE_KERNEL(less_than, LessThanFunctor) -REGISTER_CUDA_COMPARE_KERNEL(less_equal, LessEqualFunctor) -REGISTER_CUDA_COMPARE_KERNEL(greater_than, GreaterThanFunctor) -REGISTER_CUDA_COMPARE_KERNEL(greater_equal, GreaterEqualFunctor) -#undef REGISTER_CUDA_COMPARE_KERNEL diff --git a/paddle/fluid/operators/controlflow/compare_op.h b/paddle/fluid/operators/controlflow/compare_op.h deleted file mode 100644 index be017a01ef3237fd8572e248d691daa97c999509..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/controlflow/compare_op.h +++ /dev/null @@ -1,109 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
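The REGISTER_COMPARE_OP macro introduced above is hard to follow once the patch is flattened. As a rough guide, its expansion for a single operator looks like the sketch below; the angle-bracket template arguments were dropped from this copy of the patch, so the EmptyGradOpMaker arguments shown are the conventional Paddle ones and should be read as an assumption, not verbatim source.

struct _less_thanComment {
  static char type[];
  static char equation[];
};
char _less_thanComment::type[]{"less_than"};
char _less_thanComment::equation[]{"Out = X < Y"};

// Shape inference is delegated to phi::CompareInferMeta instead of the removed
// hand-written CompareOp::InferShape.
DECLARE_INFER_SHAPE_FUNCTOR(less_than, less_than_InferShapeFunctor,
                            PD_INFER_META(phi::CompareInferMeta));

REGISTER_OPERATOR(
    less_than, ::paddle::operators::CompareOp<_less_thanComment>,
    ::paddle::operators::CompareOpProtoMaker<_less_thanComment>,
    ::paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,   // assumed
    ::paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,  // assumed
    less_than_InferShapeFunctor);
REGISTER_COMPARE_OP_VERSION(less_than);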
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" -#include "paddle/fluid/platform/transform.h" - -namespace paddle { -namespace operators { - -#define COMPARE_FUNCTOR(func_name, op) \ - template \ - struct func_name { \ - using ELEM_TYPE = InT; \ - HOSTDEVICE OutT operator()(const InT a, const InT b) const { \ - return static_cast(a op b); \ - } \ - }; - -COMPARE_FUNCTOR(LessThanFunctor, <) -COMPARE_FUNCTOR(LessEqualFunctor, <=) -COMPARE_FUNCTOR(GreaterThanFunctor, >) -COMPARE_FUNCTOR(GreaterEqualFunctor, >=) -#undef COMPARE_FUNCTOR - -template -struct EqualFunctor { - using ELEM_TYPE = InT; - HOSTDEVICE OutT operator()(const InT a, const InT b) const { - if (std::is_floating_point::value) { - // This branch will be optimized while compiling if T is integer. It is - // safe to cast a and b to double. - return static_cast(fabs(static_cast(a - b)) < 1e-8); - } else { - return static_cast(a == b); - } - } -}; - -template -struct NotEqualFunctor { - using ELEM_TYPE = InT; - HOSTDEVICE bool operator()(const InT a, const InT b) const { - return !EqualFunctor()(a, b); - } -}; - -template -class CompareOpKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - using T = typename Functor::ELEM_TYPE; - using Tensor = framework::Tensor; - - auto* x = context.Input("X"); - auto* y = context.Input("Y"); - auto* z = context.Output("Out"); - int axis = context.Attr("axis"); - - auto x_dims = x->dims(); - auto y_dims = y->dims(); - if (x_dims.size() >= y_dims.size()) { - ElementwiseComputeEx(context, x, y, axis, - Functor(), z); - } else { - ElementwiseComputeEx( - context, x, y, axis, InverseFunctor(), z); - } - } -}; - -} // namespace operators -} // namespace paddle - -#define REGISTER_COMPARE_KERNEL(op_type, dev, functor, inverse_functor) \ - REGISTER_OP_##dev##_KERNEL(op_type, \ - ::paddle::operators::CompareOpKernel< \ - ::paddle::platform::dev##DeviceContext, \ - functor, inverse_functor>, \ - ::paddle::operators::CompareOpKernel< \ - ::paddle::platform::dev##DeviceContext, \ - functor, inverse_functor>, \ - ::paddle::operators::CompareOpKernel< \ - ::paddle::platform::dev##DeviceContext, \ - functor, inverse_functor>, \ - ::paddle::operators::CompareOpKernel< \ - ::paddle::platform::dev##DeviceContext, \ - functor, inverse_functor>, \ - ::paddle::operators::CompareOpKernel< \ - ::paddle::platform::dev##DeviceContext, \ - functor, inverse_functor>, \ - ::paddle::operators::CompareOpKernel< \ - ::paddle::platform::dev##DeviceContext, \ - functor, inverse_functor>); diff --git a/paddle/fluid/operators/controlflow/compare_op_mlu.cc b/paddle/fluid/operators/controlflow/compare_op_mlu.cc new file mode 100644 index 0000000000000000000000000000000000000000..9dc287ab76a67c6026ec8794793e77179063af3d --- /dev/null +++ b/paddle/fluid/operators/controlflow/compare_op_mlu.cc @@ -0,0 +1,200 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
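The functor definitions in the deleted compare_op.h above (and the EqualReduceFunctor in compare_all_op.h) lost their template parameter lists in this copy of the patch. The self-contained sketch below restores the equality functor with assumed parameter names (InT/OutT) and drops the HOSTDEVICE qualifier so it compiles on its own; the point to keep is that floating-point equality is an absolute-tolerance test (|a - b| < 1e-8) while integral types compare exactly.

#include <cmath>
#include <cstdio>
#include <type_traits>

template <typename InT, typename OutT = bool>  // parameter names assumed
struct EqualFunctor {
  using ELEM_TYPE = InT;
  OutT operator()(const InT a, const InT b) const {
    if (std::is_floating_point<InT>::value) {
      // Branch is compiled away for integral InT; casting to double keeps the
      // subtraction safe for the 1e-8 absolute tolerance.
      return static_cast<OutT>(std::fabs(static_cast<double>(a - b)) < 1e-8);
    }
    return static_cast<OutT>(a == b);
  }
};

int main() {
  EqualFunctor<float> feq;
  EqualFunctor<int> ieq;
  std::printf("%d %d %d\n",
              static_cast<int>(feq(1.0f, 1.0f + 1e-9f)),  // 1: within tolerance
              static_cast<int>(feq(1.0f, 1.5f)),          // 0
              static_cast<int>(ieq(3, 3)));                // 1
  return 0;
}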
+Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/controlflow/compare_op.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" + +namespace paddle { +namespace operators { + +template +class EqualMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc input_x(*x, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(x->dtype())); + MLUCnnlTensorDesc input_y(*y, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(y->dtype())); + MLUCnnlTensorDesc output(*out, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(out->dtype())); + MLUCnnl::Logic(ctx, CNNL_LOGIC_OP_EQ, input_x.get(), GetBasePtr(x), + input_y.get(), GetBasePtr(y), output.get(), GetBasePtr(out)); + } +}; + +template +class NotEqualMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc input_x(*x, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(x->dtype())); + MLUCnnlTensorDesc input_y(*y, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(y->dtype())); + MLUCnnlTensorDesc output(*out, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(out->dtype())); + MLUCnnl::Logic(ctx, CNNL_LOGIC_OP_NE, input_x.get(), GetBasePtr(x), + input_y.get(), GetBasePtr(y), output.get(), GetBasePtr(out)); + } +}; + +template +class LessThanMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc input_x(*x, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(x->dtype())); + MLUCnnlTensorDesc input_y(*y, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(y->dtype())); + MLUCnnlTensorDesc output(*out, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(out->dtype())); + MLUCnnl::Logic(ctx, CNNL_LOGIC_OP_LT, input_x.get(), GetBasePtr(x), + input_y.get(), GetBasePtr(y), output.get(), GetBasePtr(out)); + } +}; + +template +class LessEqualMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc input_x(*x, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(x->dtype())); + MLUCnnlTensorDesc input_y(*y, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(y->dtype())); + MLUCnnlTensorDesc output(*out, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(out->dtype())); + MLUCnnl::Logic(ctx, CNNL_LOGIC_OP_LE, input_x.get(), GetBasePtr(x), + input_y.get(), GetBasePtr(y), output.get(), GetBasePtr(out)); + } +}; + +template +class GreaterThanMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + 
auto* y = ctx.Input("Y"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc input_x(*x, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(x->dtype())); + MLUCnnlTensorDesc input_y(*y, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(y->dtype())); + MLUCnnlTensorDesc output(*out, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(out->dtype())); + MLUCnnl::Logic(ctx, CNNL_LOGIC_OP_GT, input_x.get(), GetBasePtr(x), + input_y.get(), GetBasePtr(y), output.get(), GetBasePtr(out)); + } +}; + +template +class GreaterEqualMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc input_x(*x, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(x->dtype())); + MLUCnnlTensorDesc input_y(*y, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(y->dtype())); + MLUCnnlTensorDesc output(*out, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(out->dtype())); + MLUCnnl::Logic(ctx, CNNL_LOGIC_OP_GE, input_x.get(), GetBasePtr(x), + input_y.get(), GetBasePtr(y), output.get(), GetBasePtr(out)); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_MLU_KERNEL( + equal, ops::EqualMLUKernel, + ops::EqualMLUKernel, + ops::EqualMLUKernel, + ops::EqualMLUKernel, + ops::EqualMLUKernel, + ops::EqualMLUKernel, + ops::EqualMLUKernel); + +REGISTER_OP_MLU_KERNEL( + not_equal, ops::NotEqualMLUKernel, + ops::NotEqualMLUKernel, + ops::NotEqualMLUKernel, + ops::NotEqualMLUKernel, + ops::NotEqualMLUKernel, + ops::NotEqualMLUKernel, + ops::NotEqualMLUKernel); + +REGISTER_OP_MLU_KERNEL( + less_than, ops::LessThanMLUKernel, + ops::LessThanMLUKernel, + ops::LessThanMLUKernel, + ops::LessThanMLUKernel, + ops::LessThanMLUKernel, + ops::LessThanMLUKernel, + ops::LessThanMLUKernel); + +REGISTER_OP_MLU_KERNEL( + less_equal, ops::LessEqualMLUKernel, + ops::LessEqualMLUKernel, + ops::LessEqualMLUKernel, + ops::LessEqualMLUKernel, + ops::LessEqualMLUKernel, + ops::LessEqualMLUKernel, + ops::LessEqualMLUKernel); + +REGISTER_OP_MLU_KERNEL( + greater_than, + ops::GreaterThanMLUKernel, + ops::GreaterThanMLUKernel, + ops::GreaterThanMLUKernel, + ops::GreaterThanMLUKernel, + ops::GreaterThanMLUKernel, + ops::GreaterThanMLUKernel, + ops::GreaterThanMLUKernel); + +REGISTER_OP_MLU_KERNEL( + greater_equal, + ops::GreaterEqualMLUKernel, + ops::GreaterEqualMLUKernel, + ops::GreaterEqualMLUKernel, + ops::GreaterEqualMLUKernel, + ops::GreaterEqualMLUKernel, + ops::GreaterEqualMLUKernel, + ops::GreaterEqualMLUKernel); diff --git a/paddle/fluid/operators/controlflow/compare_op_npu.cc b/paddle/fluid/operators/controlflow/compare_op_npu.cc index 7bc4ca09771355361d8106421dc57601b94c88f1..7377d7cf8d312c4f4f405235b21b372b1a7a738c 100644 --- a/paddle/fluid/operators/controlflow/compare_op_npu.cc +++ b/paddle/fluid/operators/controlflow/compare_op_npu.cc @@ -11,7 +11,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/controlflow/compare_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" diff --git a/paddle/fluid/operators/controlflow/compare_op_xpu.cc b/paddle/fluid/operators/controlflow/compare_op_xpu.cc index 698bd0516133861523f8d2b353abfeace4665840..2de8b4c9ba880e089bb4eaa4fa8df3bedb69b55b 100644 --- a/paddle/fluid/operators/controlflow/compare_op_xpu.cc +++ b/paddle/fluid/operators/controlflow/compare_op_xpu.cc @@ -12,7 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/controlflow/compare_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/controlflow/logical_op.cc b/paddle/fluid/operators/controlflow/logical_op.cc index a4262d405435ae31c2a5ad681ab443889ec5d393..4d11cb5ff74e69e991271d2a566dbc9344d35da2 100644 --- a/paddle/fluid/operators/controlflow/logical_op.cc +++ b/paddle/fluid/operators/controlflow/logical_op.cc @@ -9,11 +9,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/controlflow/logical_op.h" #include #include #include #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" namespace paddle { namespace operators { @@ -145,15 +145,7 @@ class BinaryLogicalOp : public LogicalOp { ::paddle::framework::EmptyGradOpMaker); REGISTER_BINARY_LOGICAL_OP(logical_and, "$$Out = X \\&\\& Y$$"); -REGISTER_BINARY_LOGICAL_KERNEL(logical_and, CPU, - paddle::operators::LogicalAndFunctor); REGISTER_BINARY_LOGICAL_OP(logical_or, "$$Out = X || Y$$"); -REGISTER_BINARY_LOGICAL_KERNEL(logical_or, CPU, - paddle::operators::LogicalOrFunctor); REGISTER_UNARY_LOGICAL_OP(logical_not, "$$Out = !X$$"); -REGISTER_UNARY_LOGICAL_KERNEL(logical_not, CPU, - paddle::operators::LogicalNotFunctor); REGISTER_BINARY_LOGICAL_OP(logical_xor, "$$Out = (X || Y) \\&\\& !(X \\&\\& Y)$$"); -REGISTER_BINARY_LOGICAL_KERNEL(logical_xor, CPU, - paddle::operators::LogicalXorFunctor); diff --git a/paddle/fluid/operators/controlflow/logical_op.cu b/paddle/fluid/operators/controlflow/logical_op.cu deleted file mode 100644 index d88658607ed275808d64dddf4a60d52d4f995e73..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/controlflow/logical_op.cu +++ /dev/null @@ -1,69 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/controlflow/logical_op.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" - -namespace paddle { -namespace operators { - -template -class BinaryLogicalOpKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - using InT = typename Functor::ELEMENT_TYPE; - using OutT = bool; - - auto functor = Functor(); - std::vector ins; - std::vector outs; - const auto& cuda_ctx = - ctx.template device_context(); - int axis = PackTensorsIntoVector(ctx, &ins, &outs); - - if (ins.size() == 1) { - paddle::operators::LaunchElementwiseCudaKernel( - cuda_ctx, ins, &outs, axis, functor); - } else { - paddle::operators::LaunchElementwiseCudaKernel( - cuda_ctx, ins, &outs, axis, functor); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -#define REGISTER_LOGICAL_CUDA_KERNEL(op_name, func) \ - REGISTER_OP_CUDA_KERNEL( \ - op_name, \ - ops::BinaryLogicalOpKernel>, \ - ops::BinaryLogicalOpKernel>, \ - ops::BinaryLogicalOpKernel>, \ - ops::BinaryLogicalOpKernel>, \ - ops::BinaryLogicalOpKernel>, \ - ops::BinaryLogicalOpKernel>, \ - ops::BinaryLogicalOpKernel>); - -REGISTER_LOGICAL_CUDA_KERNEL(logical_or, LogicalOrFunctor) -REGISTER_LOGICAL_CUDA_KERNEL(logical_and, LogicalAndFunctor) -REGISTER_LOGICAL_CUDA_KERNEL(logical_xor, LogicalXorFunctor) -REGISTER_LOGICAL_CUDA_KERNEL(logical_not, LogicalNotFunctor) -#undef REGISTER_LOGICAL_CUDA_KERNEL diff --git a/paddle/fluid/operators/controlflow/logical_op.h b/paddle/fluid/operators/controlflow/logical_op.h deleted file mode 100644 index 15cd643a858cc018e3007fa90ec479900cd243be..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/controlflow/logical_op.h +++ /dev/null @@ -1,111 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" -#include "paddle/fluid/platform/transform.h" - -namespace paddle { -namespace operators { - -#define LOGICAL_BINARY_FUNCTOR(func_name, op) \ - template \ - struct func_name { \ - using ELEMENT_TYPE = T; \ - HOSTDEVICE bool operator()(const T a, const T b) const { \ - return static_cast(a) op static_cast(b); \ - } \ - }; - -LOGICAL_BINARY_FUNCTOR(LogicalOrFunctor, ||) -LOGICAL_BINARY_FUNCTOR(LogicalAndFunctor, &&) -LOGICAL_BINARY_FUNCTOR(LogicalXorFunctor, ^) -#undef LOGICAL_BINARY_FUNCTOR - -template -struct LogicalNotFunctor { - using ELEMENT_TYPE = T; - HOSTDEVICE bool operator()(const T a) const { return !a; } -}; - -template -class BinaryLogicalOpKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - using T = typename Functor::ELEMENT_TYPE; - auto* x = context.Input("X"); - auto* y = context.Input("Y"); - auto* out = context.Output("Out"); - Functor binary_func; - ElementwiseComputeEx(context, x, y, -1, - binary_func, out); - } -}; - -template -class UnaryLogicalOpKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - using T = typename Functor::ELEMENT_TYPE; - auto* x = context.Input("X"); - auto* out = context.Output("Out"); - Functor unary_func; - platform::Transform trans; - trans(context.template device_context(), x->data(), - x->data() + x->numel(), - out->mutable_data(context.GetPlace()), unary_func); - } -}; - -} // namespace operators -} // namespace paddle - -#define REGISTER_BINARY_LOGICAL_KERNEL(op_type, dev, functor) \ - REGISTER_OP_##dev##_KERNEL( \ - op_type, ::paddle::operators::BinaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::BinaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::BinaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::BinaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::BinaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::BinaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::BinaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>); - -#define REGISTER_UNARY_LOGICAL_KERNEL(op_type, dev, functor) \ - REGISTER_OP_##dev##_KERNEL( \ - op_type, ::paddle::operators::UnaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::UnaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::UnaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::UnaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::UnaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::UnaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::UnaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>); diff --git a/paddle/fluid/operators/controlflow/logical_op_npu.cc b/paddle/fluid/operators/controlflow/logical_op_npu.cc index 02f95254035d6041ef64dd746faa924abb053165..c3d7df8d0274371a4c5a482624c75b36677778a9 100644 
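The six kernels in the new compare_op_mlu.cc above all follow one pattern; only the CNNL logic-op enum differs. Because the angle-bracket arguments were stripped from this copy of the patch, the sketch below fills them back in with the usual conventions (framework::Tensor inputs, a bool output, one MLUDeviceContext/type pair per registered dtype); treat those spellings as assumptions rather than verbatim source.

template <typename DeviceContext, typename T>  // parameters assumed
class LessThanMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* x = ctx.Input<framework::Tensor>("X");
    auto* y = ctx.Input<framework::Tensor>("Y");
    auto* out = ctx.Output<framework::Tensor>("Out");
    out->mutable_data<bool>(ctx.GetPlace());  // comparison results are bool

    MLUCnnlTensorDesc input_x(*x, CNNL_LAYOUT_ARRAY, ToCnnlDataType(x->dtype()));
    MLUCnnlTensorDesc input_y(*y, CNNL_LAYOUT_ARRAY, ToCnnlDataType(y->dtype()));
    MLUCnnlTensorDesc output(*out, CNNL_LAYOUT_ARRAY, ToCnnlDataType(out->dtype()));
    // CNNL_LOGIC_OP_LT is the only piece that changes across the six kernels
    // (EQ, NE, LT, LE, GT, GE map to equal, not_equal, less_than, less_equal,
    // greater_than, greater_equal).
    MLUCnnl::Logic(ctx, CNNL_LOGIC_OP_LT, input_x.get(), GetBasePtr(x),
                   input_y.get(), GetBasePtr(y), output.get(), GetBasePtr(out));
  }
};

// The registrations above list one instantiation per supported element type,
// presumably along the lines of:
//   REGISTER_OP_MLU_KERNEL(
//       less_than,
//       ops::LessThanMLUKernel<plat::MLUDeviceContext, float>,
//       ops::LessThanMLUKernel<plat::MLUDeviceContext, plat::float16>,
//       /* ... remaining integer and bool instantiations ... */);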
--- a/paddle/fluid/operators/controlflow/logical_op_npu.cc +++ b/paddle/fluid/operators/controlflow/logical_op_npu.cc @@ -9,7 +9,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/controlflow/logical_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/conv_cudnn_helper.h b/paddle/fluid/operators/conv_cudnn_helper.h index 3bbb284ca821b8576f2752446555f146c16bb189..4e6fda3d09a071f59c97c87315619d126497a756 100644 --- a/paddle/fluid/operators/conv_cudnn_helper.h +++ b/paddle/fluid/operators/conv_cudnn_helper.h @@ -26,6 +26,7 @@ limitations under the License. */ #include "paddle/fluid/operators/eigen/eigen_function.h" #include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" +#include "paddle/phi/backends/gpu/gpu_context.h" namespace paddle { namespace operators { @@ -53,12 +54,11 @@ static inline void GetNCDHW(const framework::DDim& dims, } template -static void RemovePaddingSlice(const framework::ExecutionContext& context, +static void RemovePaddingSlice(const phi::GPUContext& context, const Tensor* input, Tensor* out, const std::vector& starts, const std::vector& axes) { - auto& place = - *context.template device_context().eigen_device(); + auto& place = *context.eigen_device(); auto in_dims = input->dims(); auto new_out_dims = out->dims(); auto offsets = Eigen::DSizes(); @@ -171,11 +171,10 @@ void ChooseAlgo(const std::vector& perf_results, using framework::ConvSearchCache; -static void SetConvMathType(const framework::ExecutionContext& ctx, - cudnnDataType_t dtype, +static void SetConvMathType(const phi::GPUContext& ctx, cudnnDataType_t dtype, const platform::ConvolutionDescriptor& cdesc) { #if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1) - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx; if (dev_ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) { PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( cdesc.desc(), CUDNN_TENSOR_OP_MATH)); @@ -231,8 +230,7 @@ struct SearchAlgorithm { template static algo_t Find(const ConvArgs& args, bool exhaustive_search, - bool deterministic, - const framework::ExecutionContext& ctx) { + bool deterministic, const phi::GPUContext& ctx) { auto dtype = platform::CudnnDataType::type; bool has_got_workspace_size = true; size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024; @@ -284,8 +282,7 @@ struct SearchAlgorithm { } else if (deterministic) { algo = static_cast(1); } else { - auto& dev_ctx = - ctx.template device_context(); + auto& dev_ctx = ctx; auto workspace_handle = dev_ctx.cudnn_workspace_handle(); AlgorithmsCache& algo_cache = @@ -346,8 +343,7 @@ struct SearchAlgorithm { template static algo_t Find(const ConvArgs& args, bool exhaustive_search, - bool deterministic, - const framework::ExecutionContext& ctx) { + bool deterministic, const phi::GPUContext& ctx) { auto dtype = platform::CudnnDataType::type; size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024; size_t workspace_size = 0; @@ -413,8 +409,7 @@ struct SearchAlgorithm { } else if (deterministic) { return CUDNN_CONVOLUTION_BWD_DATA_ALGO_1; } else { - auto& dev_ctx = - ctx.template device_context(); + auto& dev_ctx = ctx; auto workspace_handle = 
dev_ctx.cudnn_workspace_handle(); AlgorithmsCache& algo_cache = @@ -478,8 +473,7 @@ struct SearchAlgorithm { template static algo_t Find(const ConvArgs& args, bool exhaustive_search, - bool deterministic, - const framework::ExecutionContext& ctx) { + bool deterministic, const phi::GPUContext& ctx) { platform::CUDAGraphCaptureModeGuard guard; auto dtype = platform::CudnnDataType::type; size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024; @@ -534,8 +528,7 @@ struct SearchAlgorithm { } else if (deterministic) { return CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1; } else { - auto& dev_ctx = - ctx.template device_context(); + auto& dev_ctx = ctx; auto workspace_handle = dev_ctx.cudnn_workspace_handle(); AlgorithmsCache& algo_cache = *(framework::ConvSearchCache::Instance().GetBackwardFilter()); diff --git a/paddle/fluid/operators/conv_cudnn_op.cu b/paddle/fluid/operators/conv_cudnn_op.cu deleted file mode 100644 index dff60afd74c02f458b5b3c7428c2703197b61af0..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/conv_cudnn_op.cu +++ /dev/null @@ -1,1476 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the spopecific language governing permissions and -limitations under the License. */ - -#include -#include - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/memory/memory.h" -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/operators/conv_miopen_helper.h" -#else -#include "paddle/fluid/operators/conv_cudnn_helper.h" -#endif -#include "paddle/fluid/operators/conv_op.h" -#include "paddle/fluid/operators/math/padding.h" -#include "paddle/fluid/platform/cudnn_workspace_helper.h" -#include "paddle/fluid/platform/float16.h" -#include "paddle/fluid/platform/profiler/event_tracing.h" - -DECLARE_bool(cudnn_deterministic); -DECLARE_uint64(conv_workspace_size_limit); -DECLARE_bool(cudnn_exhaustive_search); - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using ScopedTensorDescriptor = platform::ScopedTensorDescriptor; -using ScopedFilterDescriptor = platform::ScopedFilterDescriptor; -using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor; -using DataLayout = platform::DataLayout; - -static inline bool IsVoltaOrLater(const platform::CUDADeviceContext& dev_ctx) { - return dev_ctx.GetComputeCapability() >= 70; -} - -template -class CUDNNConvOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = ctx.template device_context(); - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - paddle::platform::errors::PreconditionNotMet("It must use CUDAPlace.")); - const Tensor* input = ctx.Input("Input"); - auto* filter = ctx.Input("Filter"); - auto* output = ctx.Output("Output"); - output->mutable_data(ctx.GetPlace()); - const std::vector strides = ctx.Attr>("strides"); - std::vector paddings = ctx.Attr>("paddings"); - std::vector dilations = 
ctx.Attr>("dilations"); - int groups = ctx.Attr("groups"); - - bool exhaustive_search = - FLAGS_cudnn_exhaustive_search || (ctx.HasAttr("exhaustive_search") && - ctx.Attr("exhaustive_search")); - bool deterministic = FLAGS_cudnn_deterministic; - auto exhaustive_deterministic = exhaustive_search && deterministic; - PADDLE_ENFORCE_EQ(exhaustive_deterministic, false, - platform::errors::InvalidArgument( - "Cann't set exhaustive_search True and " - "FLAGS_cudnn_deterministic True at same time.")); - - const std::string padding_algorithm = - ctx.Attr("padding_algorithm"); - const std::string data_format = ctx.Attr("data_format"); - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - - auto dtype = platform::CudnnDataType::type; - -#ifdef PADDLE_WITH_HIP - // HIP MIOPEN ONLY SUPPORT NCHW format - auto compute_format = DataLayout::kNCHW; -#else - // Tensor Core introduced from Volta GPUs supports more faster conv op - // with FP16 in NHWC data format. - const bool compute_in_nhwc = - dtype == CUDNN_DATA_HALF && IsVoltaOrLater(dev_ctx); - // We will only do data format conversion from NHWC to NCHW. - // cudnn will convert NCHW to NHWC automatically on Tensor Core. - auto compute_format = - compute_in_nhwc && channel_last ? DataLayout::kNHWC : DataLayout::kNCHW; -#endif - VLOG(3) << "Compute ConvOp with cuDNN:" - << " data_format=" << data_format << " compute_format=" - << (compute_format == DataLayout::kNHWC ? "NHWC" : "NCHW"); - - // ------------ transformed tensor ----------- - Tensor transformed_input_channel(input->type()); - Tensor transformed_output(output->type()); - Tensor transformed_filter_channel(filter->type()); - T* output_data = nullptr; - if (channel_last && compute_format == DataLayout::kNCHW) { - VLOG(3) << "Transform input tensor from NHWC to NCHW."; - ResizeToChannelFirst( - ctx, input, &transformed_input_channel); - TransToChannelFirst( - ctx, input, &transformed_input_channel); - - ResizeToChannelFirst(ctx, output, - &transformed_output); - - } else { - transformed_input_channel.ShareDataWith(*input); - transformed_output.ShareDataWith(*output); - } - if (compute_format == DataLayout::kNHWC) { - VLOG(3) << "Transform filter tensor from NCHW to NHWC."; - ResizeToChannelLast( - ctx, filter, &transformed_filter_channel); - TransToChannelLast( - ctx, filter, &transformed_filter_channel); - } else { - transformed_filter_channel.ShareDataWith(*filter); - } - output_data = transformed_output.data(); - - // update padding and dilation - auto in_dims = transformed_input_channel.dims(); - auto filter_dims = transformed_filter_channel.dims(); - framework::DDim in_data_dims; - framework::DDim filter_data_dims; - - if (compute_format == DataLayout::kNCHW) { - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - filter_data_dims = phi::slice_ddim(filter_dims, 2, filter_dims.size()); - } else { - in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - filter_data_dims = - phi::slice_ddim(filter_dims, 1, filter_dims.size() - 1); - } - - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - int data_dim = strides.size(); // 2d or 3d - bool is_sys_pad = math::IsSymmetricPadding(paddings, data_dim); - - Tensor transformed_input; - std::vector padding_common(data_dim, 0); - if (!is_sys_pad) { - std::vector padding_diff(data_dim); - std::vector new_input_shape_vec(data_dim + 2); - new_input_shape_vec[0] = transformed_input_channel.dims()[0]; - - if 
(compute_format == DataLayout::kNCHW) { - new_input_shape_vec[1] = transformed_input_channel.dims()[1]; - } else { - new_input_shape_vec[data_dim + 1] = - transformed_input_channel.dims()[data_dim + 1]; - } - - std::vector input_pad(transformed_input_channel.dims().size() * 2, - 0); - for (size_t i = 0; i < data_dim; ++i) { - padding_diff[i] = std::abs(paddings[2 * i] - paddings[2 * i + 1]); - padding_common[i] = std::min(paddings[2 * i], paddings[2 * i + 1]); - if (compute_format == DataLayout::kNCHW) { - new_input_shape_vec[i + 2] = - transformed_input_channel.dims()[i + 2] + padding_diff[i]; - } else { - new_input_shape_vec[i + 1] = - transformed_input_channel.dims()[i + 1] + padding_diff[i]; - } - if (compute_format == DataLayout::kNCHW) { - input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i]; - input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i]; - } else { - input_pad[2 * i + 2] = paddings[2 * i] - padding_common[i]; - input_pad[2 * i + 2 + 1] = paddings[2 * i + 1] - padding_common[i]; - } - } - framework::DDim new_input_shape(phi::make_ddim(new_input_shape_vec)); - transformed_input.Resize(new_input_shape); - auto& dev_ctx = - ctx.template device_context(); - - transformed_input = - ctx.AllocateTmpTensor( - new_input_shape, dev_ctx); - const int rank = transformed_input_channel.dims().size(); - T pad_value(0.0); - switch (rank) { - case 4: { - math::PadFunction( - ctx, input_pad, transformed_input_channel, pad_value, - &transformed_input); - } break; - case 5: { - math::PadFunction( - ctx, input_pad, transformed_input_channel, pad_value, - &transformed_input); - } break; - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "ConvOp only support tensors with 4 or 5 dimensions.")); - } - - } else { - transformed_input.ShareDataWith(transformed_input_channel); - if (paddings.size() == data_dim) { - for (size_t i = 0; i < data_dim; ++i) { - padding_common[i] = paddings[i]; - } - } else { - for (size_t i = 0; i < data_dim; ++i) { - padding_common[i] = paddings[2 * i]; - } - } - } - - const T* input_data = transformed_input.data(); - const T* filter_data = transformed_filter_channel.data(); - - // ------------------- cudnn descriptors --------------------- - ConvArgs args{&transformed_input, - &transformed_filter_channel, - &transformed_output, - strides, - padding_common, - dilations, - dtype}; - - auto handle = dev_ctx.cudnn_handle(); - auto workspace_handle = dev_ctx.cudnn_workspace_handle(); - DataLayout layout = compute_format == DataLayout::kNHWC ? DataLayout::kNHWC - : DataLayout::kNCHW; - if (transformed_input.dims().size() == 5) { - layout = compute_format == DataLayout::kNHWC ? DataLayout::kNDHWC - : DataLayout::kNCDHW; - } - auto layout_format = GetCudnnTensorFormat(layout); - - args.handle = handle; - -#ifdef PADDLE_WITH_HIP - // MIOPEN need to set groups in cdesc in miopen_desc.h - args.cdesc.set(dtype, padding_common, strides, dilations, - platform::AllowTF32Cudnn(), groups); -#else - args.cdesc.set(dtype, padding_common, strides, dilations, - platform::AllowTF32Cudnn()); -#endif - -#if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION_MIN(7, 0, 1) - // cudnn 7 can support groups, no need to do it manually - // FIXME(typhoonzero): find a better way to disable groups - // rather than setting it to 1. 
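// Note on the per-group bookkeeping a few lines below: the kernel steps through
// the groups by advancing raw data pointers by
//   group_offset_in     = (i_c / groups) * i_h * i_w * i_d
//   group_offset_out    = (o_c / groups) * o_h * o_w * o_d
//   group_offset_filter = filter.numel() / groups
// e.g. (illustrative numbers, not from the patch) groups = 2, input [1, 8, 4, 4],
// output [1, 6, 4, 4] gives offsets of 64 and 48 elements per group. On cuDNN 7+
// the group count is handed to the convolution descriptor instead and `groups` is
// reset to 1, so the per-group loop runs exactly once.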
- PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionGroupCount( - args.cdesc.desc(), groups)); - groups = 1; -#endif -#ifdef PADDLE_WITH_HIP - // MIOPEN do not set groups in wdesc after set groups in cdesc - groups = 1; -#endif - args.idesc.set(transformed_input, layout_format); - args.wdesc.set(transformed_filter_channel, layout_format, groups); - args.odesc.set(transformed_output, layout_format); - int i_n, i_c, i_d, i_h, i_w; - int o_n, o_c, o_d, o_h, o_w; - - if (compute_format == DataLayout::kNHWC) { - GetNCDHW(transformed_input.dims(), DataLayout::kNHWC, &i_n, &i_c, &i_d, - &i_h, &i_w); - GetNCDHW(transformed_output.dims(), DataLayout::kNHWC, &o_n, &o_c, &o_d, - &o_h, &o_w); - } else { - GetNCDHW(transformed_input.dims(), DataLayout::kNCHW, &i_n, &i_c, &i_d, - &i_h, &i_w); - GetNCDHW(transformed_output.dims(), DataLayout::kNCHW, &o_n, &o_c, &o_d, - &o_h, &o_w); - } - - int group_offset_in = i_c / groups * i_h * i_w * i_d; - int group_offset_out = o_c / groups * o_h * o_w * o_d; - int group_offset_filter = transformed_filter_channel.numel() / groups; - // ------------------- cudnn conv workspace --------------------- - size_t workspace_size = 0; // final workspace to allocate. -// ------------------- cudnn conv algorithm --------------------- -#ifdef PADDLE_WITH_HIP - miopenConvFwdAlgorithm_t algo{}; - using search = SearchAlgorithm; - workspace_size = search::GetWorkspaceSize(args); - algo = search::Find(args, exhaustive_search, deterministic, - workspace_size, ctx); -#else - cudnnConvolutionFwdAlgo_t algo{}; - using search = SearchAlgorithm; - algo = search::Find(args, exhaustive_search, deterministic, ctx); - workspace_size = search::GetWorkspaceSize(args, algo); -#endif - -#if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION_MIN(7, 0, 1) - // when groups > 1, SearchAlgorithm find algo is CUDNN_CONVOLUTION_\ - // FWD_ALGO_WINOGRAD_NONFUSED, but this kind of algorithm is unstable - // in forward computation, so change the algorithm to CUDNN_CONVOLUTION_\ - // FWD_ALGO_IMPLICIT_GEMM manually. - if (ctx.Attr("groups") > 1) { - algo = static_cast(0); - } -#endif - - // ------------------- cudnn conv forward --------------------- - ScalingParamType alpha = 1.0f; - ScalingParamType beta = 0.0f; - -// NOTE(zhiqiu): inplace addto is not supportted in double grad yet. -// ScalingParamType beta = ctx.Attr("use_addto") ? 
1.0f : 0.0f; -// VLOG(4) << "Conv: use_addto = " << ctx.Attr("use_addto"); - -#ifdef PADDLE_WITH_HIP - workspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenConvolutionForward( - handle, &alpha, args.idesc.desc(), input_data, - args.wdesc.desc(), filter_data, args.cdesc.desc(), algo, - &beta, args.odesc.desc(), output_data, workspace_ptr, - workspace_size)); - }, - workspace_size); -#else - for (int i = 0; i < groups; i++) { - workspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnConvolutionForward( - handle, &alpha, args.idesc.desc(), - input_data + i * group_offset_in, args.wdesc.desc(), - filter_data + i * group_offset_filter, args.cdesc.desc(), - algo, workspace_ptr, workspace_size, &beta, - args.odesc.desc(), output_data + i * group_offset_out)); - }, - workspace_size); - } -#endif - - if (channel_last && compute_format == DataLayout::kNCHW) { - TransToChannelLast( - ctx, &transformed_output, output); - } - } -}; - -template -class CUDNNConvGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = ctx.template device_context(); - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - paddle::platform::errors::PreconditionNotMet("It must use CUDAPlace.")); - auto input = ctx.Input("Input"); - auto filter = ctx.Input("Filter"); - auto output_grad = ctx.Input(framework::GradVarName("Output")); - auto input_grad = ctx.Output(framework::GradVarName("Input")); - auto filter_grad = ctx.Output(framework::GradVarName("Filter")); - - if (input_grad) { - input_grad->mutable_data(ctx.GetPlace()); - } - if (filter_grad) { - filter_grad->mutable_data(ctx.GetPlace()); - } - - std::vector dilations = ctx.Attr>("dilations"); - std::vector strides = ctx.Attr>("strides"); - std::vector paddings = ctx.Attr>("paddings"); - std::string padding_algorithm = ctx.Attr("padding_algorithm"); - int groups = ctx.Attr("groups"); - - bool exhaustive_search = - FLAGS_cudnn_exhaustive_search || (ctx.HasAttr("exhaustive_search") && - ctx.Attr("exhaustive_search")); - bool deterministic = FLAGS_cudnn_deterministic; - auto exhaustive_deterministic = exhaustive_search && deterministic; - PADDLE_ENFORCE_EQ(exhaustive_deterministic, false, - platform::errors::InvalidArgument( - "Cann't set exhaustive_search True and " - "FLAGS_cudnn_deterministic True at same time.")); - - const std::string data_format = ctx.Attr("data_format"); - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - - auto dtype = platform::CudnnDataType::type; - -#ifdef PADDLE_WITH_HIP - // HIP MIOPEN ONLY SUPPORT NCHW format - auto compute_format = DataLayout::kNCHW; -#else - const bool compute_in_nhwc = - dtype == CUDNN_DATA_HALF && IsVoltaOrLater(dev_ctx); - auto compute_format = - compute_in_nhwc && channel_last ? DataLayout::kNHWC : DataLayout::kNCHW; -#endif - VLOG(3) << "Compute ConvGradOp with cuDNN:" - << " data_format=" << data_format << " compute_format=" - << (compute_format == DataLayout::kNHWC ? 
"NHWC" : "NCHW"); - - // transform Tensor - Tensor transformed_input_channel(input->type()); - Tensor transformed_output_grad_channel(output_grad->type()); - Tensor transformed_input_grad_channel(input->type()); - Tensor transformed_filter_channel(filter->type()); - Tensor transformed_filter_grad_channel(filter->type()); - - if (channel_last && compute_format == DataLayout::kNCHW) { - VLOG(3) << "Transform input, output_grad, input_grad and tensor from " - "NHWC to NCHW."; - ResizeToChannelFirst( - ctx, input, &transformed_input_channel); - TransToChannelFirst( - ctx, input, &transformed_input_channel); - - ResizeToChannelFirst( - ctx, output_grad, &transformed_output_grad_channel); - TransToChannelFirst( - ctx, output_grad, &transformed_output_grad_channel); - - if (input_grad) { - ResizeToChannelFirst( - ctx, input_grad, &transformed_input_grad_channel); - // NOTE(zhiqiu): If inplace_addto strategy is enabled, we need to copy - // the data of input_grad to transformed_input_grad_channel. - if (ctx.HasAttr("use_addto") && ctx.Attr("use_addto")) { - TransToChannelFirst( - ctx, input_grad, &transformed_input_grad_channel); - } - } - } else { - transformed_input_channel.ShareDataWith(*input); - transformed_output_grad_channel.ShareDataWith(*output_grad); - if (input_grad) { - transformed_input_grad_channel.ShareDataWith(*input_grad); - } - } - - if (compute_format == DataLayout::kNHWC) { - VLOG(3) << "Transform filter and filter_grad tensor from NCHW to NHWC."; - ResizeToChannelLast( - ctx, filter, &transformed_filter_channel); - TransToChannelLast( - ctx, filter, &transformed_filter_channel); - - if (filter_grad) { - ResizeToChannelLast( - ctx, filter_grad, &transformed_filter_grad_channel); - } - } else { - transformed_filter_channel.ShareDataWith(*filter); - if (filter_grad) { - transformed_filter_grad_channel.ShareDataWith(*filter_grad); - } - } - - // update paddings - auto in_dims = transformed_input_channel.dims(); - auto filter_dims = transformed_filter_channel.dims(); - framework::DDim in_data_dims; - framework::DDim filter_data_dims; - if (compute_format == DataLayout::kNCHW) { - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - filter_data_dims = phi::slice_ddim(filter_dims, 2, filter_dims.size()); - } else { - in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - filter_data_dims = - phi::slice_ddim(filter_dims, 1, filter_dims.size() - 1); - } - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - // cuDNN only supports padding the same amount on every dimension. - // So we create a new padded input tensor. 
- int data_dim = strides.size(); // 2d or 3d - bool is_sys_pad = math::IsSymmetricPadding(paddings, data_dim); - Tensor transformed_input(input->type()); - Tensor transformed_input_grad(input->type()); - std::vector padding_common(data_dim, 0); - std::vector input_pad(transformed_input_channel.dims().size() * 2, 0); - - if (!is_sys_pad) { - // get pad - std::vector padding_diff(data_dim); - std::vector new_input_shape_vec(data_dim + 2); - new_input_shape_vec[0] = transformed_input_channel.dims()[0]; - if (compute_format == DataLayout::kNCHW) { - new_input_shape_vec[1] = transformed_input_channel.dims()[1]; - } else { - new_input_shape_vec[data_dim + 1] = - transformed_input_channel.dims()[data_dim + 1]; - } - - for (size_t i = 0; i < data_dim; ++i) { - padding_diff[i] = std::abs(paddings[2 * i] - paddings[2 * i + 1]); - padding_common[i] = std::min(paddings[2 * i], paddings[2 * i + 1]); - if (compute_format == DataLayout::kNCHW) { - new_input_shape_vec[i + 2] = - transformed_input_channel.dims()[i + 2] + padding_diff[i]; - } else { - new_input_shape_vec[i + 1] = - transformed_input_channel.dims()[i + 1] + padding_diff[i]; - } - if (compute_format == DataLayout::kNCHW) { - input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i]; - input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i]; - } else { - input_pad[2 * i + 2] = paddings[2 * i] - padding_common[i]; - input_pad[2 * i + 2 + 1] = paddings[2 * i + 1] - padding_common[i]; - } - } - framework::DDim new_input_shape(phi::make_ddim(new_input_shape_vec)); - transformed_input.Resize(new_input_shape); - - transformed_input_grad.Resize(new_input_shape); - auto& dev_ctx = - ctx.template device_context(); - - transformed_input = - ctx.AllocateTmpTensor( - new_input_shape, dev_ctx); - if (input_grad) { - transformed_input_grad = - ctx.AllocateTmpTensor( - new_input_shape, dev_ctx); - } - // pad for input - const int rank = transformed_input_channel.dims().size(); - T pad_value(0.0); - switch (rank) { - case 4: { - math::PadFunction( - ctx, input_pad, transformed_input_channel, pad_value, - &transformed_input); - } break; - case 5: { - math::PadFunction( - ctx, input_pad, transformed_input_channel, pad_value, - &transformed_input); - } break; - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "ConvOp only support tensors with 4 or 5 dimensions.")); - } - } else { - transformed_input.ShareDataWith(transformed_input_channel); - if (input_grad) { - transformed_input_grad.ShareDataWith(transformed_input_grad_channel); - } - if (paddings.size() == data_dim) { - for (size_t i = 0; i < data_dim; ++i) { - padding_common[i] = paddings[i]; - } - } else { - for (size_t i = 0; i < data_dim; ++i) { - padding_common[i] = paddings[2 * i]; - } - } - } - - const T* input_data = transformed_input.data(); - const T* output_grad_data = transformed_output_grad_channel.data(); - const T* filter_data = transformed_filter_channel.data(); - T* filter_grad_data = nullptr; - T* input_grad_data = nullptr; - T* transformed_input_grad_data = nullptr; - - ConvArgs args1{&transformed_input_grad, - &transformed_filter_channel, - &transformed_output_grad_channel, - strides, - padding_common, - dilations, - dtype}; - ConvArgs args2{&transformed_input, - &transformed_filter_grad_channel, - &transformed_output_grad_channel, - strides, - padding_common, - dilations, - dtype}; - - auto handle = dev_ctx.cudnn_handle(); - DataLayout layout = compute_format == DataLayout::kNHWC ? 
DataLayout::kNHWC - : DataLayout::kNCHW; - if (transformed_input.dims().size() == 5) { - layout = compute_format == DataLayout::kNHWC ? DataLayout::kNDHWC - : DataLayout::kNCDHW; - } - auto layout_tensor = GetCudnnTensorFormat(layout); - auto workspace_handle = dev_ctx.cudnn_workspace_handle(); - - int i_n, i_c, i_d, i_h, i_w; - int o_n, o_c, o_d, o_h, o_w; - if (compute_format == DataLayout::kNHWC) { - GetNCDHW(transformed_input.dims(), DataLayout::kNHWC, &i_n, &i_c, &i_d, - &i_h, &i_w); - GetNCDHW(transformed_output_grad_channel.dims(), DataLayout::kNHWC, &o_n, - &o_c, &o_d, &o_h, &o_w); - } else { - GetNCDHW(transformed_input.dims(), DataLayout::kNCHW, &i_n, &i_c, &i_d, - &i_h, &i_w); - GetNCDHW(transformed_output_grad_channel.dims(), DataLayout::kNCHW, &o_n, - &o_c, &o_d, &o_h, &o_w); - } - - int group_offset_in = i_c / groups * i_h * i_w * i_d; - int group_offset_out = o_c / groups * o_h * o_w * o_d; - int group_offset_filter = transformed_filter_channel.numel() / groups; -// ------------------- cudnn backward algorithm --------------------- -#ifdef PADDLE_WITH_HIP - miopenConvBwdDataAlgorithm_t data_algo = - static_cast(0); - miopenConvBwdWeightsAlgorithm_t filter_algo = - static_cast(0); -#else - cudnnConvolutionBwdDataAlgo_t data_algo = - static_cast(0); - cudnnConvolutionBwdFilterAlgo_t filter_algo = - static_cast(0); -#endif - // input data workspace_size - size_t workspace_size_d = 0; - // weight workspace_size - size_t workspace_size_w = 0; - int iwo_groups = groups; - int c_groups = 1; - -#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) - iwo_groups = 1; - c_groups = groups; - groups = 1; -#endif - - if (input_grad) { - // ------------------- cudnn descriptors --------------------- - input_grad_data = input_grad->data(); - transformed_input_grad_data = transformed_input_grad.data(); - args1.handle = handle; - args1.idesc.set(transformed_input_grad, layout_tensor); - args1.wdesc.set(transformed_filter_channel, layout_tensor, iwo_groups); - args1.odesc.set(transformed_output_grad_channel, layout_tensor); - args1.cdesc.set(dtype, padding_common, strides, dilations, - platform::AllowTF32Cudnn(), c_groups); - -#ifdef PADDLE_WITH_HIP - using search1 = SearchAlgorithm; - workspace_size_d = - std::max(workspace_size_d, search1::GetWorkspaceSize(args1)); - data_algo = search1::Find(args1, exhaustive_search, deterministic, - workspace_size_d, ctx); -#else - using search1 = SearchAlgorithm; - data_algo = - search1::Find(args1, exhaustive_search, deterministic, ctx); - workspace_size_d = std::max(workspace_size_d, - search1::GetWorkspaceSize(args1, data_algo)); -#endif - } - - if (filter_grad) { - // ------------------- cudnn descriptors --------------------- - filter_grad_data = transformed_filter_grad_channel.data(); - args2.handle = handle; - args2.idesc.set(transformed_input, layout_tensor); - args2.wdesc.set(transformed_filter_grad_channel, layout_tensor, - iwo_groups); - args2.odesc.set(transformed_output_grad_channel, layout_tensor); - args2.cdesc.set(dtype, padding_common, strides, dilations, - platform::AllowTF32Cudnn(), c_groups); -#ifdef PADDLE_WITH_HIP - using search2 = SearchAlgorithm; - workspace_size_w = - std::max(workspace_size_w, search2::GetWorkspaceSize(args2)); - filter_algo = search2::Find(args2, exhaustive_search, deterministic, - workspace_size_w, ctx); -#else - using search2 = SearchAlgorithm; - filter_algo = - search2::Find(args2, exhaustive_search, deterministic, ctx); - workspace_size_w = std::max( - workspace_size_w, search2::GetWorkspaceSize(args2, 
filter_algo)); -#endif - } - - // ------------------- cudnn conv backward data --------------------- - ScalingParamType alpha = 1.0f; -#ifdef PADDLE_WITH_HIP - // MIOPEN ONLY support beta to be 0.0f - ScalingParamType beta = 0.0f; -#else - ScalingParamType beta = - (ctx.HasAttr("use_addto") && ctx.Attr("use_addto")) ? 1.0f : 0.0f; -#endif - VLOG(4) << "Conv_grad: use_addto = " - << (ctx.HasAttr("use_addto") && ctx.Attr("use_addto")); - - if (input_grad) { -// When beta is 0, it is unnecessary to reset input_grad. -// When beta is 1, the output cannot be reset since addt strategy used. -#ifdef PADDLE_WITH_HIP - if (ctx.HasAttr("use_addto") && ctx.Attr("use_addto")) { - Tensor temp_tensor(transformed_input_grad.type()); - temp_tensor.Resize(transformed_input_grad.dims()); - T* temp_tensor_data = temp_tensor.mutable_data(ctx.GetPlace()); - workspace_handle.RunFunc( - [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenConvolutionBackwardData( - handle, &alpha, args1.odesc.desc(), output_grad_data, - args1.wdesc.desc(), filter_data, args1.cdesc.desc(), - data_algo, &beta, args1.idesc.desc(), temp_tensor_data, - cudnn_workspace_ptr, workspace_size_d)); - }, - workspace_size_d); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenOpTensor( - handle, miopenTensorOpAdd, &alpha, args1.idesc.desc(), - transformed_input_grad_data, &alpha, args1.idesc.desc(), - temp_tensor_data, &beta, args1.idesc.desc(), - transformed_input_grad_data)); - } else { - workspace_handle.RunFunc( - [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenConvolutionBackwardData( - handle, &alpha, args1.odesc.desc(), output_grad_data, - args1.wdesc.desc(), filter_data, args1.cdesc.desc(), - data_algo, &beta, args1.idesc.desc(), - transformed_input_grad_data, cudnn_workspace_ptr, - workspace_size_d)); - }, - workspace_size_d); - } - -#else - for (int i = 0; i < groups; i++) { - workspace_handle.RunFunc( - [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnConvolutionBackwardData( - handle, &alpha, args1.wdesc.desc(), - filter_data + i * group_offset_filter, args1.odesc.desc(), - output_grad_data + i * group_offset_out, - args1.cdesc.desc(), data_algo, cudnn_workspace_ptr, - workspace_size_d, &beta, args1.idesc.desc(), - transformed_input_grad_data + i * group_offset_in)); - }, - workspace_size_d); - } -#endif - if (!is_sys_pad) { - std::vector starts(transformed_input_channel.dims().size(), 0); - std::vector axes(transformed_input_channel.dims().size(), 0); - - for (size_t i = 0; i < transformed_input_channel.dims().size(); ++i) { - starts[i] = input_pad[2 * i]; - axes[i] = i; - } - - transformed_input_grad_channel.mutable_data(ctx.GetPlace()); - if (transformed_input_channel.dims().size() == 4) { - RemovePaddingSlice( - ctx, &transformed_input_grad, &transformed_input_grad_channel, - starts, axes); - } else { - RemovePaddingSlice( - ctx, &transformed_input_grad, &transformed_input_grad_channel, - starts, axes); - } - } - - if (channel_last && compute_format == DataLayout::kNCHW) { - TransToChannelLast( - ctx, &transformed_input_grad_channel, input_grad); - } - } - - // filter_grad do not use inplace addto. - ScalingParamType beta_filter = 0.0f; - // ------------------- cudnn conv backward filter --------------------- - if (filter_grad) { -// Because beta is zero, it is unnecessary to reset filter_grad. 
-#ifdef PADDLE_WITH_HIP - workspace_handle.RunFunc( - [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenConvolutionBackwardWeights( - handle, &alpha, args2.odesc.desc(), output_grad_data, - args2.idesc.desc(), input_data, args2.cdesc.desc(), - filter_algo, &beta, args2.wdesc.desc(), filter_grad_data, - cudnn_workspace_ptr, workspace_size_w)); - }, - workspace_size_w); -#else - for (int i = 0; i < groups; i++) { - workspace_handle.RunFunc( - [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnConvolutionBackwardFilter( - handle, &alpha, args2.idesc.desc(), - input_data + i * group_offset_in, args2.odesc.desc(), - output_grad_data + i * group_offset_out, - args2.cdesc.desc(), filter_algo, cudnn_workspace_ptr, - workspace_size_w, &beta_filter, args2.wdesc.desc(), - filter_grad_data + i * group_offset_filter)); - }, - workspace_size_w); - } -#endif - - if (compute_format == DataLayout::kNHWC) { - TransToChannelFirst( - ctx, &transformed_filter_grad_channel, filter_grad); - } - } - } -}; - -/* - * Inputs: I, W, dO, ddI, ddW - * Outputs: ddO, dW, dI - * ddo = conv(ddI, W) + conv(I, ddW) - * dW = conv_bp_filter(ddI, dO) - * dI = conv_bp_data(ddW, dO) - */ -template -class CUDNNConvDoubleGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = ctx.template device_context(); - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - paddle::platform::errors::PreconditionNotMet("It must use CUDAPlace.")); - auto X = ctx.Input("Input"); - auto W = ctx.Input("Filter"); - auto dO = ctx.Input("DOutput"); - auto ddX = ctx.Input("DDInput"); - auto ddW = ctx.Input("DDFilter"); - - auto ddO = ctx.Output("DDOutput"); - auto dW = ctx.Output("DFilter"); - auto dX = ctx.Output("DInput"); - if (ddO) { - ddO->mutable_data(ctx.GetPlace()); - phi::funcs::SetConstant set_zero; - set_zero(dev_ctx, ddO, static_cast(0)); - } - if (dW) { - dW->mutable_data(ctx.GetPlace()); - } - if (dX) { - dX->mutable_data(ctx.GetPlace()); - } - - // const T* x = X->data(); - const T* dy = dO->data(); - const T* w = W->data(); - - const T* ddx = nullptr; - const T* ddw = nullptr; - T *dw, *dx, *ddy; - dw = dx = ddy = nullptr; - T* transformed_dx = nullptr; - const std::vector& strides = ctx.Attr>("strides"); - std::vector dilations = ctx.Attr>("dilations"); - int groups = ctx.Attr("groups"); - - bool exhaustive_search = - FLAGS_cudnn_exhaustive_search || (ctx.HasAttr("exhaustive_search") && - ctx.Attr("exhaustive_search")); - bool deterministic = FLAGS_cudnn_deterministic; - auto exhaustive_deterministic = exhaustive_search && deterministic; - PADDLE_ENFORCE_EQ(exhaustive_deterministic, false, - platform::errors::InvalidArgument( - "Cann't set exhaustive_search True and " - "FLAGS_cudnn_deterministic True at same time.")); - - std::vector paddings = ctx.Attr>("paddings"); - - std::string padding_algorithm = ctx.Attr("padding_algorithm"); - const std::string data_format = ctx.Attr("data_format"); - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - - // transform Tensors to channel first----------- - Tensor transformed_X_channel(X->type()); - Tensor transformed_dO_channel(dO->type()); - Tensor transformed_ddX_channel(X->type()); - - Tensor transformed_ddO_channel(dO->type()); - Tensor transformed_dX_channel(X->type()); - - if (channel_last) { - ResizeToChannelFirst( - ctx, X, &transformed_X_channel); - TransToChannelFirst( - ctx, X, 
&transformed_X_channel); - - ResizeToChannelFirst( - ctx, dO, &transformed_dO_channel); - TransToChannelFirst( - ctx, dO, &transformed_dO_channel); - - if (ddX) { - ResizeToChannelFirst( - ctx, ddX, &transformed_ddX_channel); - TransToChannelFirst( - ctx, ddX, &transformed_ddX_channel); - } - - if (ddO) { - ResizeToChannelFirst( - ctx, ddO, &transformed_ddO_channel); - } - if (dX) { - ResizeToChannelFirst( - ctx, dX, &transformed_dX_channel); - transformed_dX_channel.mutable_data(ctx.GetPlace()); - } - - } else { - transformed_X_channel = *X; - transformed_dO_channel = *dO; - if (ddX) { - transformed_ddX_channel = *ddX; - } - if (ddO) { - transformed_ddO_channel.ShareDataWith(*ddO); - } - if (dX) { - transformed_dX_channel.ShareDataWith(*dX); - } - } - - auto in_dims = transformed_X_channel.dims(); - auto filter_dims = W->dims(); - framework::DDim in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - framework::DDim filter_data_dims = - phi::slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - int data_dim = strides.size(); // 2d or 3d - bool is_sys_pad = math::IsSymmetricPadding(paddings, data_dim); - Tensor transformed_X(X->type()); - Tensor transformed_ddX(X->type()); - - Tensor transformed_dX(X->type()); - - std::vector padding_common(data_dim, 0); - std::vector input_pad(X->dims().size() * 2, 0); - - if (!is_sys_pad) { - // get pad - std::vector padding_diff(data_dim); - std::vector new_input_shape_vec(data_dim + 2); - new_input_shape_vec[0] = transformed_X_channel.dims()[0]; - new_input_shape_vec[1] = transformed_X_channel.dims()[1]; - - for (size_t i = 0; i < data_dim; ++i) { - padding_diff[i] = std::abs(paddings[2 * i] - paddings[2 * i + 1]); - padding_common[i] = std::min(paddings[2 * i], paddings[2 * i + 1]); - new_input_shape_vec[i + 2] = - transformed_X_channel.dims()[i + 2] + padding_diff[i]; - input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i]; - input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i]; - } - framework::DDim new_input_shape(phi::make_ddim(new_input_shape_vec)); - transformed_X.Resize(new_input_shape); - transformed_ddX.Resize(new_input_shape); - transformed_dX.Resize(new_input_shape); - - transformed_X = - ctx.AllocateTmpTensor( - new_input_shape, dev_ctx); - if (ddX) { - transformed_ddX = - ctx.AllocateTmpTensor( - new_input_shape, dev_ctx); - } - if (dX) { - transformed_dX = - ctx.AllocateTmpTensor( - new_input_shape, dev_ctx); - } - - // pad for input - const int rank = X->dims().size(); - T pad_value(0.0); - switch (rank) { - case 4: { - math::PadFunction( - ctx, input_pad, transformed_X_channel, pad_value, &transformed_X); - if (ddX) { - math::PadFunction( - ctx, input_pad, transformed_ddX_channel, pad_value, - &transformed_ddX); - } - } break; - case 5: { - math::PadFunction( - ctx, input_pad, transformed_X_channel, pad_value, &transformed_X); - if (ddX) { - math::PadFunction( - ctx, input_pad, transformed_ddX_channel, pad_value, - &transformed_ddX); - } - } break; - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "ConvOp only support tensors with 4 or 5 dimensions.")); - } - - } else { - transformed_X.ShareDataWith(transformed_X_channel); - if (ddX) { - transformed_ddX.ShareDataWith(transformed_ddX_channel); - } - if (dX) { - transformed_dX.ShareDataWith(transformed_dX_channel); - } - - if (paddings.size() == data_dim) { - for (size_t i = 0; i < data_dim; ++i) 
{ - padding_common[i] = paddings[i]; - } - } else { - for (size_t i = 0; i < data_dim; ++i) { - padding_common[i] = paddings[2 * i]; - } - } - } - - const T* x = transformed_X.data(); - - int iwo_group = groups; - int c_group = 1; -#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) - iwo_group = 1; - c_group = groups; - groups = 1; -#endif - auto dtype = platform::CudnnDataType::type; - - auto handle = dev_ctx.cudnn_handle(); - - ConvArgs args1{&transformed_ddX, - W, - &transformed_ddO_channel, - strides, - padding_common, - dilations, - dtype}; - ConvArgs args2{ - &transformed_X, ddW, &transformed_ddO_channel, strides, padding_common, - dilations, dtype}; - ConvArgs args3{&transformed_ddX, - dW, - &transformed_dO_channel, - strides, - padding_common, - dilations, - dtype}; - ConvArgs args4{ - &transformed_dX, ddW, &transformed_dO_channel, strides, padding_common, - dilations, dtype}; - -#ifdef PADDLE_WITH_HIP - miopenConvFwdAlgorithm_t fwd_algo1 = - static_cast(0); - miopenConvFwdAlgorithm_t fwd_algo2 = - static_cast(0); - miopenConvBwdDataAlgorithm_t data_algo = - static_cast(0); - miopenConvBwdWeightsAlgorithm_t filter_algo = - static_cast(0); -#else - cudnnConvolutionFwdAlgo_t fwd_algo1 = - static_cast(0); - cudnnConvolutionFwdAlgo_t fwd_algo2 = - static_cast(0); - cudnnConvolutionBwdDataAlgo_t data_algo = - static_cast(0); - cudnnConvolutionBwdFilterAlgo_t filter_algo = - static_cast(0); -#endif - - auto layout = GetCudnnTensorFormat(DataLayout::kNCHW); - - // ddo = conv(ddI, W) + conv(I, ddW) - size_t workspace_size = 0; - - T* transformed_ddy_channel = nullptr; - if (ddO) { - ddy = ddO->data(); - transformed_ddy_channel = transformed_ddO_channel.data(); - if (ddX) { - args1.handle = handle; - args1.idesc.set(transformed_ddX, iwo_group); - args1.wdesc.set(*W, layout, iwo_group); - args1.odesc.set(transformed_ddO_channel, iwo_group); - args1.cdesc.set(dtype, padding_common, strides, dilations, - platform::AllowTF32Cudnn(), c_group); - -#ifdef PADDLE_WITH_HIP - using search1 = SearchAlgorithm; - workspace_size = search1::GetWorkspaceSize(args1); - fwd_algo1 = search1::Find(args1, exhaustive_search, false, - workspace_size, ctx); -#else - using search1 = SearchAlgorithm; - fwd_algo1 = search1::Find(args1, exhaustive_search, false, ctx); - workspace_size = search1::GetWorkspaceSize(args1, fwd_algo1); -#endif - } - - if (ddW) { - ddw = ddW->data(); - args2.handle = handle; - args2.idesc.set(transformed_X, iwo_group); - args2.wdesc.set(*ddW, layout, iwo_group); - args2.odesc.set(transformed_ddO_channel, iwo_group); - args2.cdesc.set(dtype, padding_common, strides, dilations, - platform::AllowTF32Cudnn(), c_group); - -#ifdef PADDLE_WITH_HIP - using search2 = SearchAlgorithm; - workspace_size = - std::max(workspace_size, search2::GetWorkspaceSize(args2)); - fwd_algo2 = search2::Find(args2, exhaustive_search, false, - workspace_size, ctx); -#else - using search2 = SearchAlgorithm; - fwd_algo2 = search2::Find(args2, exhaustive_search, false, ctx); - workspace_size = std::max(workspace_size, - search2::GetWorkspaceSize(args2, fwd_algo2)); -#endif - } - } - - if (dW && ddX) { - dw = dW->data(); - args3.handle = handle; - args3.idesc.set(transformed_ddX, iwo_group); - args3.wdesc.set(*dW, layout, iwo_group); - args3.odesc.set(transformed_dO_channel, iwo_group); - args3.cdesc.set(dtype, padding_common, strides, dilations, - platform::AllowTF32Cudnn(), c_group); - -#ifdef PADDLE_WITH_HIP - using search3 = SearchAlgorithm; - workspace_size = - std::max(workspace_size, 
search3::GetWorkspaceSize(args3)); - filter_algo = search3::Find(args3, exhaustive_search, deterministic, - workspace_size, ctx); -#else - using search3 = SearchAlgorithm; - filter_algo = - search3::Find(args3, exhaustive_search, deterministic, ctx); - workspace_size = std::max(workspace_size, - search3::GetWorkspaceSize(args3, filter_algo)); -#endif - } - - if (ddW && dX) { - transformed_dx = transformed_dX.data(); - - args4.handle = handle; - args4.idesc.set(transformed_dX, iwo_group); - args4.wdesc.set(*ddW, layout, iwo_group); - args4.odesc.set(transformed_dO_channel, iwo_group); - args4.cdesc.set(dtype, padding_common, strides, dilations, - platform::AllowTF32Cudnn(), c_group); - -#ifdef PADDLE_WITH_HIP - using search4 = SearchAlgorithm; - workspace_size = - std::max(workspace_size, search4::GetWorkspaceSize(args4)); - data_algo = search4::Find(args4, exhaustive_search, deterministic, - workspace_size, ctx); -#else - using search4 = SearchAlgorithm; - data_algo = - search4::Find(args4, exhaustive_search, deterministic, ctx); - workspace_size = - std::max(workspace_size, search4::GetWorkspaceSize(args4, data_algo)); -#endif - } - - int i_n, i_c, i_d, i_h, i_w; - GetNCDHW(transformed_X.dims(), DataLayout::kNCHW, &i_n, &i_c, &i_d, &i_h, - &i_w); - - int o_n, o_c, o_d, o_h, o_w; - GetNCDHW(transformed_dO_channel.dims(), DataLayout::kNCHW, &o_n, &o_c, &o_d, - &o_h, &o_w); - - int group_offset_in = i_c / groups * i_h * i_w * i_d; - int group_offset_out = o_c / groups * o_h * o_w * o_d; - int group_offset_filter = W->numel() / groups; - - ScalingParamType alpha = 1.0f; - ScalingParamType beta = 0.0f; - - // NOTE(zhiqiu): inplace addto is not supportted in double grad yet. - // ScalingParamType beta = ctx.Attr("use_addto") ? 1.0f : - // 0.0f; - // VLOG(4) << "Conv_grad_grad: use_addto = " << ctx.Attr("use_addto"); - auto wkspace_handle = dev_ctx.cudnn_workspace_handle(); - - if (ddO) { - if (ddX) { - ddx = transformed_ddX.data(); -#ifdef PADDLE_WITH_HIP - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenConvolutionForward( - handle, &alpha, args1.idesc.desc(), ddx, - args1.wdesc.desc(), w, args1.cdesc.desc(), fwd_algo1, - &beta, args1.odesc.desc(), transformed_ddy_channel, - workspace_ptr, workspace_size)); - }, - workspace_size); -#else - for (int i = 0; i < groups; i++) { - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnConvolutionForward( - handle, &alpha, args1.idesc.desc(), - ddx + i * group_offset_in, args1.wdesc.desc(), - w + i * group_offset_filter, args1.cdesc.desc(), - fwd_algo1, workspace_ptr, workspace_size, &beta, - args1.odesc.desc(), - transformed_ddy_channel + i * group_offset_out)); - }, - workspace_size); - } -#endif - } - if (ddW) { -#ifdef PADDLE_WITH_HIP - // MIOPEN ONLY support beta to be 0.0f - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenConvolutionForward( - handle, &alpha, args2.idesc.desc(), x, args2.wdesc.desc(), - ddw, args2.cdesc.desc(), fwd_algo2, &beta, - args2.odesc.desc(), transformed_ddy_channel, - workspace_ptr, workspace_size)); - }, - workspace_size); -#else - for (int i = 0; i < groups; i++) { - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnConvolutionForward( - handle, &alpha, args2.idesc.desc(), - x + i * group_offset_in, args2.wdesc.desc(), - ddw + i * group_offset_filter, args2.cdesc.desc(), - fwd_algo2, 
workspace_ptr, workspace_size, &alpha, - args2.odesc.desc(), - transformed_ddy_channel + i * group_offset_out)); - }, - workspace_size); - } -#endif - } - if (channel_last) { - TransToChannelLast( - ctx, &transformed_ddO_channel, ddO); - } - } - T* transformed_dy_channel = transformed_dO_channel.data(); - if (dW && ddX) { - ddx = transformed_ddX.data(); -#ifdef PADDLE_WITH_HIP - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenConvolutionBackwardWeights( - handle, &alpha, args3.odesc.desc(), transformed_dy_channel, - args3.idesc.desc(), ddx, args3.cdesc.desc(), filter_algo, - &beta, args3.wdesc.desc(), dw, workspace_ptr, - workspace_size)); - }, - workspace_size); -#else - for (int i = 0; i < groups; i++) { - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnConvolutionBackwardFilter( - handle, &alpha, args3.idesc.desc(), - ddx + i * group_offset_in, args3.odesc.desc(), - transformed_dy_channel + i * group_offset_out, - args3.cdesc.desc(), filter_algo, workspace_ptr, - workspace_size, &beta, args3.wdesc.desc(), - dw + i * group_offset_filter)); - }, - workspace_size); - } -#endif - } - - if (dX && ddW) { - ddw = ddW->data(); -#ifdef PADDLE_WITH_HIP - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenConvolutionBackwardData( - handle, &alpha, args4.odesc.desc(), transformed_dy_channel, - args4.wdesc.desc(), ddw, args4.cdesc.desc(), data_algo, - &beta, args4.idesc.desc(), transformed_dx, workspace_ptr, - workspace_size)); - }, - workspace_size); -#else - for (int i = 0; i < groups; i++) { - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnConvolutionBackwardData( - handle, &alpha, args4.wdesc.desc(), - ddw + i * group_offset_filter, args4.odesc.desc(), - transformed_dy_channel + i * group_offset_out, - args4.cdesc.desc(), data_algo, workspace_ptr, - workspace_size, &beta, args4.idesc.desc(), - transformed_dx + i * group_offset_in)); - }, - workspace_size); - } -#endif - - if (!is_sys_pad) { - // reverse padded input - std::vector starts(X->dims().size(), 0); - std::vector axes(X->dims().size(), 0); - - for (size_t i = 0; i < X->dims().size(); ++i) { - starts[i] = input_pad[2 * i]; - axes[i] = i; - } - if (X->dims().size() == 4) { - RemovePaddingSlice( - ctx, &transformed_dX, &transformed_dX_channel, starts, axes); - } else { - RemovePaddingSlice( - ctx, &transformed_dX, &transformed_dX_channel, starts, axes); - } - } - if (channel_last) { - TransToChannelLast( - ctx, &transformed_dX_channel, dX); - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace plat = paddle::platform; -#ifdef PADDLE_WITH_HIP -// MIOPEN do not support double -REGISTER_OP_KERNEL(conv2d, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvOpKernel, - paddle::operators::CUDNNConvOpKernel); -REGISTER_OP_KERNEL(conv2d_grad, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvGradOpKernel, - paddle::operators::CUDNNConvGradOpKernel); -REGISTER_OP_KERNEL( - conv2d_grad_grad, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel); -// ROCM has limit thread in depthwise_conv.cu and willl result in accuracy issue -// Use depthwise_conv2d in MIOPEN to resolve this issue -REGISTER_OP_KERNEL(depthwise_conv2d, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvOpKernel, - 
paddle::operators::CUDNNConvOpKernel); -REGISTER_OP_KERNEL(depthwise_conv2d_grad, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvGradOpKernel, - paddle::operators::CUDNNConvGradOpKernel); -REGISTER_OP_CUDA_KERNEL( - depthwise_conv2d_grad_grad, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel); - -REGISTER_OP_KERNEL(conv3d, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvOpKernel, - paddle::operators::CUDNNConvOpKernel); -REGISTER_OP_KERNEL(conv3d_grad, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvGradOpKernel); -REGISTER_OP_KERNEL( - conv3d_grad_grad, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel); -#else -#if CUDNN_VERSION_MIN(8, 1, 0) -REGISTER_OP_KERNEL(conv2d, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvOpKernel, - paddle::operators::CUDNNConvOpKernel, - paddle::operators::CUDNNConvOpKernel, - paddle::operators::CUDNNConvOpKernel); -REGISTER_OP_KERNEL(conv2d_grad, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvGradOpKernel, - paddle::operators::CUDNNConvGradOpKernel, - paddle::operators::CUDNNConvGradOpKernel, - paddle::operators::CUDNNConvGradOpKernel); -REGISTER_OP_KERNEL( - conv2d_grad_grad, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel); - -REGISTER_OP_CUDA_KERNEL( - depthwise_conv2d_grad_grad, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel); -#else -REGISTER_OP_KERNEL(conv2d, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvOpKernel, - paddle::operators::CUDNNConvOpKernel, - paddle::operators::CUDNNConvOpKernel); -REGISTER_OP_KERNEL(conv2d_grad, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvGradOpKernel, - paddle::operators::CUDNNConvGradOpKernel, - paddle::operators::CUDNNConvGradOpKernel); -REGISTER_OP_KERNEL( - conv2d_grad_grad, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel); - -REGISTER_OP_CUDA_KERNEL( - depthwise_conv2d_grad_grad, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel); -#endif - -REGISTER_OP_KERNEL(conv3d, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvOpKernel, - paddle::operators::CUDNNConvOpKernel, - paddle::operators::CUDNNConvOpKernel); -REGISTER_OP_KERNEL(conv3d_grad, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvGradOpKernel, - paddle::operators::CUDNNConvGradOpKernel); -REGISTER_OP_KERNEL( - conv3d_grad_grad, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel); -#endif diff --git a/paddle/fluid/operators/conv_miopen_helper.h b/paddle/fluid/operators/conv_miopen_helper.h index 9c9795143eb78dc5c1b22ec792d8753f915c976e..66f718693847837a4d169a5cab9629a1f668244f 100644 --- a/paddle/fluid/operators/conv_miopen_helper.h +++ b/paddle/fluid/operators/conv_miopen_helper.h @@ -24,6 +24,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/operator_kernel_configs.h" #include "paddle/fluid/operators/conv_cudnn_op_cache.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" +#include "paddle/phi/backends/gpu/gpu_context.h" namespace paddle { namespace operators { @@ -51,12 +52,11 @@ static inline void GetNCDHW(const framework::DDim& dims, } template -static void RemovePaddingSlice(const framework::ExecutionContext& context, +static void RemovePaddingSlice(const phi::GPUContext& context, const Tensor* input, Tensor* out, const std::vector& starts, const std::vector& axes) { - auto& place = - *context.template device_context().eigen_device(); + auto& place = *context.eigen_device(); auto in_dims = input->dims(); auto new_out_dims = out->dims(); auto offsets = Eigen::array(); @@ -128,11 +128,10 @@ struct SearchAlgorithm { template static algo_t Find(const ConvArgs& args, bool exhaustive_search, bool deterministic, size_t workspace_size, - const framework::ExecutionContext& ctx) { + const phi::GPUContext& ctx) { algo_t algo; - auto& dev_ctx = ctx.template device_context(); - auto workspace_handle = dev_ctx.cudnn_workspace_handle(); + auto workspace_handle = ctx.cudnn_workspace_handle(); int find_count; miopenConvAlgoPerf_t find_result; @@ -170,11 +169,10 @@ struct SearchAlgorithm { template static algo_t Find(const ConvArgs& args, bool exhaustive_search, bool deterministic, size_t workspace_size, - const framework::ExecutionContext& ctx) { + const phi::GPUContext& ctx) { algo_t algo; - auto& dev_ctx = ctx.template device_context(); - auto workspace_handle = dev_ctx.cudnn_workspace_handle(); + auto workspace_handle = ctx.cudnn_workspace_handle(); int find_count; miopenConvAlgoPerf_t find_result; @@ -212,11 +210,10 @@ struct SearchAlgorithm { template static algo_t Find(const ConvArgs& args, bool exhaustive_search, bool deterministic, size_t workspace_size, - const framework::ExecutionContext& ctx) { + const phi::GPUContext& ctx) { algo_t algo; - auto& dev_ctx = ctx.template device_context(); - auto workspace_handle = dev_ctx.cudnn_workspace_handle(); + auto workspace_handle = ctx.cudnn_workspace_handle(); int find_count; miopenConvAlgoPerf_t find_result; diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index e345a4d2603b630508e299207984f4708217a1d8..8213e877f722433488cd826bb63cba376972c57a 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -205,14 +205,14 @@ framework::OpKernelType ConvOp::GetExpectedKernelType( paddle::framework::DataTypeToString(input_data_type), paddle::framework::DataTypeToString(filter_data_type))); } -#ifndef PADDLE_WITH_ASCEND_CL - if (input_data_type == framework::proto::VarType::FP16) { - PADDLE_ENFORCE_EQ( - library, framework::LibraryType::kCUDNN, - platform::errors::InvalidArgument( - "float16 can only be used when CUDNN or NPU is used")); - } -#endif +// #ifndef PADDLE_WITH_ASCEND_CL +// if (input_data_type == framework::proto::VarType::FP16) { +// PADDLE_ENFORCE_EQ( +// library, framework::LibraryType::kCUDNN, +// platform::errors::InvalidArgument( +// "float16 can only be used when CUDNN or NPU is used")); +// } +// #endif #if PADDLE_WITH_CUDA if (input_data_type == framework::proto::VarType::BF16 && library == framework::LibraryType::kCUDNN) { @@ -869,42 +869,6 @@ REGISTER_OPERATOR(conv3d_grad, ops::ConvOpGrad, ops::Conv3DDoubleGradMaker); REGISTER_OPERATOR(conv3d_grad_grad, ops::ConvOpDoubleGrad); -// depthwise conv kernel -// TODO(xingzhaolong): neon kernel for mobile 
-REGISTER_OP_CPU_KERNEL( - depthwise_conv2d, - ops::GemmConvKernel, - ops::GemmConvKernel); - -REGISTER_OP_CPU_KERNEL( - depthwise_conv2d_grad, - ops::GemmConvGradKernel, - ops::GemmConvGradKernel); - -REGISTER_OP_CPU_KERNEL( - conv2d, ops::GemmConvKernel, - ops::GemmConvKernel); -REGISTER_OP_CPU_KERNEL( - conv2d_grad, - ops::GemmConvGradKernel, - ops::GemmConvGradKernel); -REGISTER_OP_CPU_KERNEL( - conv2d_grad_grad, - ops::GemmConvDoubleGradKernel, - ops::GemmConvDoubleGradKernel); - -REGISTER_OP_CPU_KERNEL( - conv3d, ops::GemmConvKernel, - ops::GemmConvKernel); -REGISTER_OP_CPU_KERNEL( - conv3d_grad, - ops::GemmConvGradKernel, - ops::GemmConvGradKernel); -REGISTER_OP_CPU_KERNEL( - conv3d_grad_grad, - ops::GemmConvDoubleGradKernel, - ops::GemmConvDoubleGradKernel); - REGISTER_OP_VERSION(conv2d) .AddCheckpoint( R"ROC( diff --git a/paddle/fluid/operators/conv_op.cu.cc b/paddle/fluid/operators/conv_op.cu.cc deleted file mode 100644 index d07593f5c02e9129c1f333667baccb0531bc31f9..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/conv_op.cu.cc +++ /dev/null @@ -1,43 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/conv_op.h" - -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL( - depthwise_conv2d, - ops::DepthwiseConvKernel, - ops::DepthwiseConvKernel); - -REGISTER_OP_CUDA_KERNEL( - depthwise_conv2d_grad, - ops::DepthwiseConvGradKernel, - ops::DepthwiseConvGradKernel); - -REGISTER_OP_CUDA_KERNEL( - conv2d, ops::GemmConvKernel, - ops::GemmConvKernel); -REGISTER_OP_CUDA_KERNEL( - conv2d_grad, - ops::GemmConvGradKernel, - ops::GemmConvGradKernel); - -REGISTER_OP_CUDA_KERNEL( - conv3d, ops::GemmConvKernel, - ops::GemmConvKernel); -REGISTER_OP_CUDA_KERNEL( - conv3d_grad, - ops::GemmConvGradKernel, - ops::GemmConvGradKernel); diff --git a/paddle/fluid/operators/conv_op.h b/paddle/fluid/operators/conv_op.h index 26166362da8a2984dc3c0670b186b85800767fb7..a5d888765bf37d45d501a3dbe5437f7c2ab5fc51 100644 --- a/paddle/fluid/operators/conv_op.h +++ b/paddle/fluid/operators/conv_op.h @@ -21,7 +21,6 @@ limitations under the License. */ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/layout_utils.h" -#include "paddle/fluid/operators/math/depthwise_conv.h" #include "paddle/fluid/operators/math/im2col.h" #include "paddle/fluid/operators/math/vol2col.h" #include "paddle/phi/kernels/funcs/blas/blas.h" @@ -214,817 +213,5 @@ class ConvOpDoubleGrad : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override; }; -template -class GemmConvKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* input = context.Input("Input"); - // The filter will be reshaped in the calculations, - // so here use an assignment operation, - // that avoids modifying the variable in the Scope. 
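The removed `GemmConvKernel` whose body follows implements convolution as im2col/vol2col plus one GEMM per sample and per group, after `UpdatePaddingAndDilation` has normalized the paddings and kernel sizes. The shapes it sets up rest on the standard convolution output-size rule; a small self-contained sketch of that arithmetic (illustrative only, assuming explicit per-side padding and positive stride):

#include <cstdio>

// Output extent of one spatial dimension for a (possibly dilated) convolution:
//   effective_kernel = dilation * (kernel - 1) + 1
//   out = (in + pad_before + pad_after - effective_kernel) / stride + 1
int ConvOutSize(int in, int kernel, int dilation, int pad_before, int pad_after,
                int stride) {
  const int effective_kernel = dilation * (kernel - 1) + 1;
  return (in + pad_before + pad_after - effective_kernel) / stride + 1;
}

int main() {
  // 224x224 input, 3x3 filter, stride 1, padding 1 on both sides -> 224.
  std::printf("same-padding 3x3: %d\n", ConvOutSize(224, 3, 1, 1, 1, 1));
  // 224x224 input, 3x3 filter, stride 2, no padding -> 111.
  std::printf("strided 3x3: %d\n", ConvOutSize(224, 3, 1, 0, 0, 2));
  return 0;
}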
- Tensor filter = *context.Input("Filter"); - Tensor* output = context.Output("Output"); - output->mutable_data(context.GetPlace()); - - const int groups = context.Attr("groups"); - const std::vector strides = context.Attr>("strides"); - std::vector paddings = context.Attr>("paddings"); - std::vector dilations = context.Attr>("dilations"); - const std::string padding_algorithm = - context.Attr("padding_algorithm"); - const std::string data_format = context.Attr("data_format"); - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - - Tensor transformed_input(input->dtype()); - Tensor transformed_output(output->dtype()); - - if (channel_last) { - ResizeToChannelFirst(context, input, - &transformed_input); - TransToChannelFirst(context, input, &transformed_input); - - ResizeToChannelFirst(context, output, - &transformed_output); - - } else { - transformed_input = *input; - transformed_output = *output; - } - - // update padding and dilation - auto trans_in_dims = transformed_input.dims(); - auto filter_dims = filter.dims(); - - framework::DDim in_data_dims = - phi::slice_ddim(trans_in_dims, 2, trans_in_dims.size()); - framework::DDim filter_data_dims = - phi::slice_ddim(filter_dims, 2, filter_dims.size()); - - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - auto& dev_ctx = context.template device_context(); - - const int batch_size = static_cast(transformed_input.dims()[0]); - - // filter_shape_vec: - // {k_o, k_i, k_h, k_w} or {k_o, k_i, k_d, k_h, k_w} - std::vector filter_shape_vec(phi::vectorize(filter.dims())); - - // output_shape_vec: - // {o_n, o_c, o_h, o_w} or {o_n, o_c, o_d, o_h, o_w} - std::vector output_shape_vec( - phi::vectorize(transformed_output.dims())); - - // use col_shape in the im2col calculation - // col_shape_vec: - // {i_c/g, k_h, k_w, o_h, o_w} or {i_c/g, k_d, k_h, k_w, - // o_d,o_h, o_w} - size_t data_dim = filter_shape_vec.size() - 2; - - std::vector col_shape_vec(1 + 2 * data_dim); - col_shape_vec[0] = trans_in_dims[1] / groups; - for (size_t j = 0; j < data_dim; ++j) { - col_shape_vec[j + 1] = filter_shape_vec[j + 2]; - col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; - } - - framework::DDim col_shape(phi::make_ddim(col_shape_vec)); - - // use col_matrix_shape in the gemm calculation - // size: - // (i_c/g * k_h * k_w, o_h * o_w) or (i_c/g * k_d * k_h * k_w, o_d * o_h * - // o_w) - - framework::DDim col_matrix_shape = phi::flatten_to_2d(col_shape, data_dim); - - bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations); - - Tensor col; - // col_matrix shares the same piece of data with col, - // but will be reshaped into a two-dimensional matrix shape - // to call the matrix multiplication interface. 
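In the body above, `IsExpand(filter_shape_vec, strides, paddings, dilations)` decides whether im2col/vol2col is needed at all: for a 1x1 filter with unit stride, unit dilation and zero padding, each output position reads exactly one input position, so the input slice can be reused directly as the col matrix (the `ShareDataWith` branch that follows). A rough re-statement of that predicate, written from the checks visible here rather than copied from Paddle's helper:

#include <cassert>
#include <vector>

// Returns true when the im2col/vol2col expansion is actually required.
bool NeedsIm2Col(const std::vector<int>& filter_spatial_dims,
                 const std::vector<int>& strides,
                 const std::vector<int>& paddings,
                 const std::vector<int>& dilations) {
  bool filter_is_1 = true, stride_is_1 = true, padding_is_0 = true,
       dilation_is_1 = true;
  for (int k : filter_spatial_dims) filter_is_1 = filter_is_1 && (k == 1);
  for (int s : strides) stride_is_1 = stride_is_1 && (s == 1);
  for (int p : paddings) padding_is_0 = padding_is_0 && (p == 0);
  for (int d : dilations) dilation_is_1 = dilation_is_1 && (d == 1);
  // Only the trivial 1x1 / stride-1 / pad-0 / dilation-1 case can skip im2col.
  return !(filter_is_1 && stride_is_1 && padding_is_0 && dilation_is_1);
}

int main() {
  assert(!NeedsIm2Col({1, 1}, {1, 1}, {0, 0, 0, 0}, {1, 1}));  // plain 1x1 conv
  assert(NeedsIm2Col({3, 3}, {1, 1}, {1, 1, 1, 1}, {1, 1}));   // padded 3x3 conv
  return 0;
}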
- Tensor col_matrix; - if (is_expand) { - col = context.AllocateTmpTensor(col_shape, dev_ctx); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } - - framework::DDim in_matrix_shape = phi::slice_ddim( - transformed_input.dims(), 1, transformed_input.dims().size()); - - framework::DDim filter_matrix_shape = {filter.dims()[0], - filter.numel() / filter.dims()[0]}; - filter.Resize(filter_matrix_shape); - - framework::DDim output_matrix_shape = { - transformed_output.dims()[1], - transformed_output.numel() / - (transformed_output.dims()[0] * transformed_output.dims()[1])}; - - // convolution operator: im2col(or vol2col) + gemm - int in_step = static_cast(transformed_input.dims()[1]) / groups; - int out_step = static_cast(transformed_output.dims()[1]) / groups; - - math::Vol2ColFunctor vol2col; - math::Im2ColFunctor im2col; - - auto blas = phi::funcs::GetBlas(dev_ctx); - for (int i = 0; i < batch_size; i++) { - Tensor in_batch = - transformed_input.Slice(i, i + 1).Resize(in_matrix_shape); - Tensor out_batch = - transformed_output.Slice(i, i + 1).Resize(output_matrix_shape); - - for (int g = 0; g < groups; g++) { - Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); - - if (!is_expand) { - col.ShareDataWith(in_slice); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } else if (data_dim == 2U) { - im2col(dev_ctx, in_slice, dilations, strides, - std::vector{paddings[0], paddings[2], paddings[1], - paddings[3]}, - &col); - - } else if (data_dim == 3U) { - vol2col(dev_ctx, in_slice, dilations, strides, paddings, &col); - } - - // gemm - Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); - Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); - blas.MatMul(filter_slice, false, col_matrix, false, T(1.0), &out_slice, - T(0.0)); - } - } - if (channel_last) { - TransToChannelLast(context, &transformed_output, - output); - } - } -}; - -template -class GemmConvGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* input = context.Input("Input"); - const Tensor* output_grad = - context.Input(framework::GradVarName("Output")); - Tensor* input_grad = - context.Output(framework::GradVarName("Input")); - Tensor* filter_grad = - context.Output(framework::GradVarName("Filter")); - // The filter and filter_grad will be reshaped in the calculations, - // so here use an assignment operation, - // that avoids modifying the variable in the Scope. 
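The per-group GEMM in the loop above computes `out_slice = filter_slice * col_matrix`, where `filter_slice` is (C_out/groups) x (C_in/groups * k_h * k_w) and `col_matrix` is (C_in/groups * k_h * k_w) x (o_h * o_w), yielding a (C_out/groups) x (o_h * o_w) output slice, as the in-code comments state. A tiny naive-matmul sketch with those shapes (illustrative only; the real kernel goes through `blas.MatMul`):

#include <cstdio>
#include <vector>

// Plain row-major matmul: C(m x n) = A(m x k) * B(k x n).
std::vector<float> MatMul(const std::vector<float>& a,
                          const std::vector<float>& b, int m, int k, int n) {
  std::vector<float> c(m * n, 0.f);
  for (int i = 0; i < m; ++i)
    for (int p = 0; p < k; ++p)
      for (int j = 0; j < n; ++j) c[i * n + j] += a[i * k + p] * b[p * n + j];
  return c;
}

int main() {
  // Example geometry: C_in=8, C_out=16, groups=2, 3x3 filter, 5x5 output.
  const int groups = 2, c_in = 8, c_out = 16, k_h = 3, k_w = 3, o_h = 5, o_w = 5;
  const int m = c_out / groups;               // rows of filter_slice
  const int k = (c_in / groups) * k_h * k_w;  // shared inner dimension
  const int n = o_h * o_w;                    // columns of col_matrix
  std::vector<float> filter_slice(m * k, 1.f);
  std::vector<float> col_matrix(k * n, 1.f);
  std::vector<float> out_slice = MatMul(filter_slice, col_matrix, m, k, n);
  // With all-ones inputs every output entry equals the inner dimension (36).
  std::printf("out_slice[0] = %.0f (expected %d)\n", out_slice[0], k);
  return 0;
}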
- Tensor filter = *context.Input("Filter"); - - if (!input_grad && !filter_grad) return; - - int groups = context.Attr("groups"); - const std::vector strides = context.Attr>("strides"); - std::vector paddings = context.Attr>("paddings"); - std::vector dilations = context.Attr>("dilations"); - const std::string padding_algorithm = - context.Attr("padding_algorithm"); - const std::string data_format = context.Attr("data_format"); - - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - - Tensor transformed_input(input->dtype()); - Tensor transformed_output_grad(output_grad->dtype()); - - if (channel_last) { - ResizeToChannelFirst(context, input, - &transformed_input); - TransToChannelFirst(context, input, &transformed_input); - - ResizeToChannelFirst(context, output_grad, - &transformed_output_grad); - TransToChannelFirst(context, output_grad, - &transformed_output_grad); - } else { - transformed_input = *input; - transformed_output_grad = *output_grad; - } - - // update padding and dilation - auto in_dims = transformed_input.dims(); - auto filter_dims = filter.dims(); - framework::DDim in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - framework::DDim filter_data_dims = - phi::slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - const int batch_size = static_cast(transformed_input.dims()[0]); - - auto& dev_ctx = context.template device_context(); - - // filter_shape_vec: {k_o, k_i, k_h, k_w} or {k_o, k_i, k_d, k_h, k_w} - std::vector filter_shape_vec(phi::vectorize(filter.dims())); - // output_shape_vec: {o_n, o_c, o_h, o_w} or {o_n, o_c, o_d, o_h, o_w} - std::vector output_shape_vec( - phi::vectorize(transformed_output_grad.dims())); - - // use col_shape in the im2col calculation - // col_shape_vec: {i_c/g, k_h, k_w, o_h, o_w} or {i_c/g, k_d, k_h, k_w, o_d, - // o_h, o_w} - size_t data_dim = filter_shape_vec.size() - 2; - std::vector col_shape_vec(1 + 2 * data_dim); - col_shape_vec[0] = transformed_input.dims()[1] / groups; - for (size_t j = 0; j < data_dim; ++j) { - col_shape_vec[j + 1] = filter_shape_vec[j + 2]; - col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; - } - framework::DDim col_shape(phi::make_ddim(col_shape_vec)); - - // use col_matrix_shape in the gemm calculation - // size: (i_c/g * k_h * k_w, o_h * o_w) - // or - // (i_c/g * k_d * k_h * k_w, o_d * o_h * o_w) - framework::DDim col_matrix_shape = - phi::flatten_to_2d(col_shape, data_dim + 1); - - framework::DDim input_shape = phi::slice_ddim( - transformed_input.dims(), 1, transformed_input.dims().size()); - - framework::DDim filter_matrix_shape = {filter.dims()[0], - filter.numel() / filter.dims()[0]}; - filter.Resize(filter_matrix_shape); - - framework::DDim output_matrix_shape = { - transformed_output_grad.dims()[1], - transformed_output_grad.numel() / (transformed_output_grad.dims()[0] * - transformed_output_grad.dims()[1])}; - - // convolution backward input operator: gemm + col2im(or col2vol) - // convolution backward weight operator: im2col(or vol2col) + gemm - int in_step = static_cast(transformed_input.dims()[1]) / groups; - int out_step = static_cast(transformed_output_grad.dims()[1]) / groups; - - bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations); - - Tensor col; - // col_matrix shares the same piece of data with col, - // but will be reshaped into a two-dimensional matrix shape - // to call 
the matrix multiplication interface. - Tensor col_matrix; - if (is_expand) { - col = context.AllocateTmpTensor(col_shape, dev_ctx); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } - - phi::funcs::SetConstant set_zero; - auto blas = phi::funcs::GetBlas(dev_ctx); - - if (input_grad) { - input_grad->mutable_data(context.GetPlace()); - Tensor transformed_input_grad(input_grad->dtype()); - if (channel_last) { - ResizeToChannelFirst(context, input_grad, - &transformed_input_grad); - - } else { - transformed_input_grad = *input_grad; - } - // if is_expand is false, the operation of set_zero is unnecessary, - // because math::matmul will reset input_grad. - if (is_expand) { - set_zero(dev_ctx, &transformed_input_grad, static_cast(0)); - } - math::Col2VolFunctor col2vol; - math::Col2ImFunctor col2im; - - for (int i = 0; i < batch_size; i++) { - Tensor out_grad_batch = - transformed_output_grad.Slice(i, i + 1).Resize(output_matrix_shape); - Tensor in_grad_batch = - transformed_input_grad.Slice(i, i + 1).Resize(input_shape); - for (int g = 0; g < groups; g++) { - // gemm - Tensor out_grad_slice = - out_grad_batch.Slice(g * out_step, (g + 1) * out_step); - Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); - - Tensor in_grad_slice = - in_grad_batch.Slice(g * in_step, (g + 1) * in_step); - - if (!is_expand) { - col_matrix.ShareDataWith(in_grad_slice); - col_matrix.Resize(col_matrix_shape); - } - blas.MatMul(filter_slice, true, out_grad_slice, false, T(1.0), - &col_matrix, T(0.0)); - - if (is_expand && data_dim == 2U) { - col2im(dev_ctx, col, dilations, strides, - std::vector{paddings[0], paddings[2], paddings[1], - paddings[3]}, - &in_grad_slice); - } else if (is_expand && data_dim == 3U) { - col2vol(dev_ctx, col, dilations, strides, paddings, &in_grad_slice); - } - } - } - if (channel_last) { - TransToChannelLast(context, &transformed_input_grad, - input_grad); - } - } - - if (filter_grad) { - filter_grad->mutable_data(context.GetPlace()); - Tensor filter_grad_ = *filter_grad; - filter_grad_.Resize(filter_matrix_shape); - set_zero(dev_ctx, filter_grad, static_cast(0)); - math::Im2ColFunctor im2col; - math::Vol2ColFunctor vol2col; - for (int i = 0; i < batch_size; i++) { - Tensor out_grad_batch = - transformed_output_grad.Slice(i, i + 1).Resize(output_matrix_shape); - Tensor in_batch = transformed_input.Slice(i, i + 1).Resize(input_shape); - for (int g = 0; g < groups; g++) { - // im2col - Tensor out_grad_slice = - out_grad_batch.Slice(g * out_step, (g + 1) * out_step); - Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); - - if (!is_expand) { - col.ShareDataWith(in_slice); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } else if (data_dim == 2U) { - im2col(dev_ctx, in_slice, dilations, strides, - std::vector{paddings[0], paddings[2], paddings[1], - paddings[3]}, - &col); - - } else if (data_dim == 3U) { - vol2col(dev_ctx, in_slice, dilations, strides, paddings, &col); - } - - // gemm - Tensor filter_grad_slice = - filter_grad_.Slice(g * out_step, (g + 1) * out_step); - blas.MatMul(out_grad_slice, false, col_matrix, true, T(1.0), - &filter_grad_slice, T(1.0)); - } - } - } - } -}; - -template -class GemmConvDoubleGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = ctx.template device_context(); - PADDLE_ENFORCE_EQ( - platform::is_cpu_place(ctx.GetPlace()), true, - paddle::platform::errors::PreconditionNotMet("It must use 
CPUPlace.")); - const Tensor* X = ctx.Input("Input"); - const Tensor* dY = ctx.Input("DOutput"); - const Tensor* ddX = ctx.Input("DDInput"); - const Tensor* ddW_in = ctx.Input("DDFilter"); - - Tensor* ddY = ctx.Output("DDOutput"); - Tensor* dW = ctx.Output("DFilter"); - Tensor* dX = ctx.Output("DInput"); - Tensor W = GET_DATA_SAFELY(ctx.Input("Filter"), "Input", "Filter", - "GemmConvDoubleGrad"); - if (!ddY && !dW && !dX) return; - - const int groups = ctx.Attr("groups"); - const std::vector strides = ctx.Attr>("strides"); - std::vector paddings = ctx.Attr>("paddings"); - std::vector dilations = ctx.Attr>("dilations"); - const std::string padding_algorithm = - ctx.Attr("padding_algorithm"); - const std::string data_format = ctx.Attr("data_format"); - - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - - // transform Tensor - Tensor transformed_X(X->dtype()); - Tensor transformed_dY(dY->dtype()); - Tensor transformed_ddX(X->dtype()); - - if (channel_last) { - ResizeToChannelFirst(ctx, X, &transformed_X); - TransToChannelFirst(ctx, X, &transformed_X); - - ResizeToChannelFirst(ctx, dY, &transformed_dY); - TransToChannelFirst(ctx, dY, &transformed_dY); - - if (ddX) { - ResizeToChannelFirst(ctx, ddX, &transformed_ddX); - TransToChannelFirst(ctx, ddX, &transformed_ddX); - } - } else { - transformed_X = *X; - transformed_dY = *dY; - if (ddX) { - transformed_ddX = *ddX; - } - } - - // update padding and dilation - auto in_dims = transformed_X.dims(); - auto filter_dims = W.dims(); - - framework::DDim in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - framework::DDim filter_data_dims = - phi::slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - const int batch_size = static_cast(transformed_X.dims()[0]); - std::vector filter_shape_vec(phi::vectorize(W.dims())); - std::vector output_shape_vec( - phi::vectorize(transformed_dY.dims())); - - size_t data_dim = filter_shape_vec.size() - 2; - std::vector col_shape_vec(1 + 2 * data_dim); - // col_shape [in_channel/group, kh, kw, oh, ow] - col_shape_vec[0] = transformed_X.dims()[1] / groups; - for (size_t j = 0; j < data_dim; ++j) { - col_shape_vec[j + 1] = filter_shape_vec[j + 2]; - col_shape_vec[j + data_dim + 1] = output_shape_vec[j + 2]; - } - framework::DDim col_shape(phi::make_ddim(col_shape_vec)); - // col_matrix_shape [in_channel/group * kh * kw, oh * ow] - framework::DDim col_matrix_shape = - phi::flatten_to_2d(col_shape, data_dim + 1); - // input_shape [Cin, H, W] - framework::DDim input_shape = - phi::slice_ddim(transformed_X.dims(), 1, transformed_X.dims().size()); - // filter_matrix_shape [Cout, Cin * kh * kw] - framework::DDim filter_matrix_shape = {W.dims()[0], - W.numel() / W.dims()[0]}; - - W.Resize(filter_matrix_shape); - framework::DDim output_matrix_shape = { - transformed_dY.dims()[1], - transformed_dY.numel() / - (transformed_dY.dims()[0] * transformed_dY.dims()[1])}; - int in_step = static_cast(transformed_X.dims()[1]) / groups; - int out_step = static_cast(transformed_dY.dims()[1]) / groups; - - bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations); - Tensor col; - Tensor col_matrix; - if (is_expand) { - col = ctx.AllocateTmpTensor(col_shape, dev_ctx); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } - - phi::funcs::SetConstant set_zero; - auto blas = phi::funcs::GetBlas(dev_ctx); - - // dx 
convolution double grad: gemm + col2im(col2vol) - // dx = ddw * dy ==> dx(N, Cin, H, W), ddw(Cout, Cin, kh, kw), dy(N, Cout, - // oH, oW) - if (dX && ddW_in) { - Tensor ddW; - ddW.ShareDataWith(*ddW_in).Resize(filter_matrix_shape); - dX->mutable_data(ctx.GetPlace()); - - Tensor transformed_dX(dX->dtype()); - - if (channel_last) { - ResizeToChannelFirst(ctx, dX, &transformed_dX); - - } else { - transformed_dX = *dX; - } - // if is_expand is false, the operation of set_zero is unnecessary - // because math::matmul will reset dx - if (is_expand) { - set_zero(dev_ctx, &transformed_dX, static_cast(0)); - } - math::Col2VolFunctor col2vol; - math::Col2ImFunctor col2im; - - for (int i = 0; i < batch_size; i++) { - Tensor dy_batch = - transformed_dY.Slice(i, i + 1).Resize(output_matrix_shape); - Tensor dx_batch = transformed_dX.Slice(i, i + 1).Resize(input_shape); - for (int g = 0; g < groups; g++) { - // gemm - Tensor dy_slice = dy_batch.Slice(g * out_step, (g + 1) * out_step); - Tensor ddw_slice = ddW.Slice(g * out_step, (g + 1) * out_step); - Tensor dx_slice = dx_batch.Slice(g * in_step, (g + 1) * in_step); - if (!is_expand) { - col_matrix.ShareDataWith(dx_slice); - col_matrix.Resize(col_matrix_shape); - } - blas.MatMul(ddw_slice, true, dy_slice, false, T(1.0), &col_matrix, - T(0.0)); - - if (is_expand && data_dim == 2U) { - col2im(dev_ctx, col, dilations, strides, - std::vector{paddings[0], paddings[2], paddings[1], - paddings[3]}, - &dx_slice); - } else if (is_expand && data_dim == 3U) { - col2vol(dev_ctx, col, dilations, strides, paddings, &dx_slice); - } - } - } - if (channel_last) { - TransToChannelLast(ctx, &transformed_dX, dX); - } - } - - // dw = ddx * dy ==> dw(Cout, Cin, kh, kw), ddx(N, Cin, H, W), dy(N, Cout, - // oH, oW) - // dw convolution double grad: im2col(vol2col) + gemm - if (dW && ddX) { - dW->mutable_data(ctx.GetPlace()); - set_zero(dev_ctx, dW, static_cast(0)); - Tensor dW_arr = *dW; - dW_arr.Resize(filter_matrix_shape); - math::Im2ColFunctor im2col; - math::Vol2ColFunctor vol2col; - for (int i = 0; i < batch_size; ++i) { - Tensor dy_batch = - transformed_dY.Slice(i, i + 1).Resize(output_matrix_shape); - Tensor ddx_batch = transformed_ddX.Slice(i, i + 1).Resize(input_shape); - for (int g = 0; g < groups; ++g) { - // im2col - Tensor dy_slice = dy_batch.Slice(g * out_step, (g + 1) * out_step); - Tensor ddx_slice = ddx_batch.Slice(g * in_step, (g + 1) * in_step); - if (!is_expand) { - col.ShareDataWith(ddx_slice); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } else if (data_dim == 2U) { - im2col(dev_ctx, ddx_slice, dilations, strides, - std::vector{paddings[0], paddings[2], paddings[1], - paddings[3]}, - &col); - } else if (data_dim == 3U) { - vol2col(dev_ctx, ddx_slice, dilations, strides, paddings, &col); - } - - Tensor dw_slice = dW_arr.Slice(g * out_step, (g + 1) * out_step); - blas.MatMul(dy_slice, false, col_matrix, true, T(1.0), &dw_slice, - T(1.0)); - } - } - } - - // ddy = w * ddx + x * ddw ==> ddy(N, Cout, oH, oW), x/ddx(N, Cin, H, W), - // w/ddw(Cout, Cin, kh, kw) - // ddy convolution double grad: im2col(vol2col) + gemm - if (ddY) { - ddY->mutable_data(ctx.GetPlace()); - - Tensor transformed_ddY(ddY->dtype()); - if (channel_last) { - ResizeToChannelFirst(ctx, ddY, &transformed_ddY); - } else { - transformed_ddY = *ddY; - } - - set_zero(dev_ctx, &transformed_ddY, static_cast(0)); - math::Im2ColFunctor im2col; - math::Vol2ColFunctor vol2col; - for (int i = 0; i < batch_size; ++i) { - Tensor ddy_batch = - transformed_ddY.Slice(i, i + 
1).Resize(output_matrix_shape); - for (int g = 0; g < groups; ++g) { - // gemm - Tensor ddy_slice = ddy_batch.Slice(g * out_step, (g + 1) * out_step); - - if (ddX) { - Tensor ddx_batch = - transformed_ddX.Slice(i, i + 1).Resize(input_shape); - Tensor ddx_slice = ddx_batch.Slice(g * in_step, (g + 1) * in_step); - if (!is_expand) { - col.ShareDataWith(ddx_slice); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } else if (data_dim == 2U) { - im2col(dev_ctx, ddx_slice, dilations, strides, - std::vector{paddings[0], paddings[2], paddings[1], - paddings[3]}, - &col); - } else if (data_dim == 3U) { - vol2col(dev_ctx, ddx_slice, dilations, strides, paddings, &col); - } - Tensor w_slice = W.Slice(g * out_step, (g + 1) * out_step); - blas.MatMul(w_slice, false, col_matrix, false, T(1.0), &ddy_slice, - T(0.0)); - } - - if (ddW_in) { - Tensor x_batch = transformed_X.Slice(i, i + 1).Resize(input_shape); - Tensor x_slice = x_batch.Slice(g * in_step, (g + 1) * in_step); - - Tensor ddW; - ddW.ShareDataWith(*ddW_in).Resize(filter_matrix_shape); - if (!is_expand) { - col.ShareDataWith(x_slice); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } else if (data_dim == 2U) { - im2col(dev_ctx, x_slice, dilations, strides, - std::vector{paddings[0], paddings[2], paddings[1], - paddings[3]}, - &col); - } else if (data_dim == 3U) { - vol2col(dev_ctx, x_slice, dilations, strides, paddings, &col); - } - - // gemm - Tensor ddw_slice = ddW.Slice(g * out_step, (g + 1) * out_step); - blas.MatMul(ddw_slice, false, col_matrix, false, T(1.0), &ddy_slice, - T(1.0)); - } - } - } - if (channel_last) { - TransToChannelLast(ctx, &transformed_ddY, ddY); - } - } - } -}; - -template -class DepthwiseConvKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* input = context.Input("Input"); - Tensor filter = *context.Input("Filter"); - Tensor* output = context.Output("Output"); - output->mutable_data(context.GetPlace()); - - const std::vector strides = context.Attr>("strides"); - std::vector paddings = context.Attr>("paddings"); - std::vector dilations = context.Attr>("dilations"); - bool fuse_relu = context.Attr("fuse_relu_before_depthwise_conv"); - - const std::string padding_algorithm = - context.Attr("padding_algorithm"); - const std::string data_format = context.Attr("data_format"); - - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - if (channel_last) { - PADDLE_ENFORCE_EQ( - output->dims()[output->dims().size() - 1] % - input->dims()[input->dims().size() - 1], - 0, platform::errors::InvalidArgument( - "ShapeError: The output channels must be a multiple of the " - "input channels. But receivced output channel number is %d " - "and input channel number is %d", - output->dims()[output->dims().size() - 1], - input->dims()[input->dims().size() - 1])); - } else { - PADDLE_ENFORCE_EQ( - output->dims()[1] % input->dims()[1], 0, - platform::errors::InvalidArgument( - "ShapeError: The output channels must be a multiple of the " - "input channels. 
But receivced output channel number is %d " - "and input channel number is %d", - output->dims()[1], input->dims()[1])); - } - - // update padding and dilation - auto in_dims = input->dims(); - auto filter_dims = filter.dims(); - - framework::DDim in_data_dims; - const framework::DataLayout data_layout = - framework::StringToDataLayout(data_format); - if (data_layout != framework::DataLayout::kNHWC) { - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - } else { - in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - } - - framework::DDim filter_data_dims = - phi::slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - bool is_sys_pad = strides.size() * 2 == paddings.size() ? false : true; - if (!is_sys_pad) { - for (size_t i = 0; i < strides.size(); ++i) { - paddings.erase(paddings.begin() + i + 1); - } - } - - auto& dev_ctx = context.template device_context(); - - if (fuse_relu) { - math::DepthwiseConvFunctor depthwiseConv; - depthwiseConv(dev_ctx, *input, filter, strides, paddings, dilations, - output, data_layout); - } else { - math::DepthwiseConvFunctor depthwiseConv; - depthwiseConv(dev_ctx, *input, filter, strides, paddings, dilations, - output, data_layout); - } - } -}; - -template -class DepthwiseConvGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* input = context.Input("Input"); - const Tensor* output_grad = - context.Input(framework::GradVarName("Output")); - Tensor* input_grad = - context.Output(framework::GradVarName("Input")); - Tensor* filter_grad = - context.Output(framework::GradVarName("Filter")); - Tensor filter = *context.Input("Filter"); - - if (!input_grad && !filter_grad) return; - - std::vector strides = context.Attr>("strides"); - std::vector paddings = context.Attr>("paddings"); - std::vector dilations = context.Attr>("dilations"); - bool fuse_relu = context.Attr("fuse_relu_before_depthwise_conv"); - const std::string padding_algorithm = - context.Attr("padding_algorithm"); - const std::string data_format = context.Attr("data_format"); - - // update padding and dilation - auto in_dims = input->dims(); - auto filter_dims = filter.dims(); - - framework::DDim in_data_dims; - const framework::DataLayout data_layout = - framework::StringToDataLayout(data_format); - if (data_layout != framework::DataLayout::kNHWC) { - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - } else { - in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - } - framework::DDim filter_data_dims = - phi::slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - bool is_sys_pad = strides.size() * 2 == paddings.size() ? 
false : true; - if (!is_sys_pad) { - for (size_t i = 0; i < strides.size(); ++i) { - paddings.erase(paddings.begin() + i + 1); - } - } - phi::funcs::SetConstant set_zero; - auto& dev_ctx = context.template device_context(); - - if (input_grad) { - input_grad->mutable_data(context.GetPlace()); - set_zero(dev_ctx, input_grad, static_cast(0)); - - if (fuse_relu) { - math::DepthwiseConvInputGradFunctor - depthwiseConvInputGrad; - depthwiseConvInputGrad(dev_ctx, *input, filter, *output_grad, strides, - paddings, dilations, input_grad, data_layout); - } else { - math::DepthwiseConvInputGradFunctor - depthwiseConvInputGrad; - depthwiseConvInputGrad(dev_ctx, *input, filter, *output_grad, strides, - paddings, dilations, input_grad, data_layout); - } - } - - if (filter_grad) { - filter_grad->mutable_data(context.GetPlace()); - set_zero(dev_ctx, filter_grad, static_cast(0)); - if (fuse_relu) { - math::DepthwiseConvFilterGradFunctor - depthwiseConvFilterGrad; - depthwiseConvFilterGrad(dev_ctx, *input, *output_grad, strides, - paddings, dilations, filter_grad, data_layout); - } else { - math::DepthwiseConvFilterGradFunctor - depthwiseConvFilterGrad; - depthwiseConvFilterGrad(dev_ctx, *input, *output_grad, strides, - paddings, dilations, filter_grad, data_layout); - } - } - } -}; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/conv_transpose_cudnn_op.cu b/paddle/fluid/operators/conv_transpose_cudnn_op.cu index 4b8f9d7e6ca8d2f1dae99f1d034c53daf948f922..1841b78af32dd95d6884d5eb78ad30322ba7723e 100644 --- a/paddle/fluid/operators/conv_transpose_cudnn_op.cu +++ b/paddle/fluid/operators/conv_transpose_cudnn_op.cu @@ -21,8 +21,8 @@ limitations under the License. */ #include "paddle/fluid/operators/conv_cudnn_helper.h" #endif #include "paddle/fluid/operators/conv_transpose_op.h" -#include "paddle/fluid/operators/math/padding.h" #include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/padding.h" namespace paddle { namespace operators { @@ -108,7 +108,7 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel { in_data_dims, strides, ksize); int data_dim = strides.size(); // 2d or 3d - bool is_sys_pad = math::IsSymmetricPadding(paddings, data_dim); + bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim); std::vector input_pad(input_transpose.dims().size() * 2, 0); Tensor transformed_input; @@ -139,12 +139,14 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel { T pad_value(0.0); switch (rank) { case 4: { - math::PadFunction( - ctx, input_pad, input_transpose, pad_value, &transformed_input); + phi::funcs::PadFunction( + dev_ctx, input_pad, input_transpose, pad_value, + &transformed_input); } break; case 5: { - math::PadFunction( - ctx, input_pad, input_transpose, pad_value, &transformed_input); + phi::funcs::PadFunction( + dev_ctx, input_pad, input_transpose, pad_value, + &transformed_input); } break; default: PADDLE_THROW(platform::errors::InvalidArgument( @@ -242,10 +244,14 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel { #ifdef PADDLE_WITH_HIP using search = SearchAlgorithm; workspace_size = std::max(workspace_size, search::GetWorkspaceSize(args)); - algo = search::Find(args, false, deterministic, workspace_size, ctx); + algo = search::Find( + args, false, deterministic, workspace_size, + ctx.template device_context()); #else using search = SearchAlgorithm; - algo = search::Find(args, false, deterministic, ctx); + algo = search::Find( + args, false, deterministic, + ctx.template 
device_context()); workspace_size = std::max(workspace_size, search::GetWorkspaceSize(args, algo)); #endif @@ -375,7 +381,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { in_data_dims, strides, ksize); int data_dim = strides.size(); // 2d or 3d - bool is_sys_pad = math::IsSymmetricPadding(paddings, data_dim); + bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim); std::vector input_pad(input_transpose.dims().size() * 2, 0); Tensor transformed_output_grad; @@ -407,13 +413,13 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { T pad_value(0.0); switch (rank) { case 4: { - math::PadFunction( - ctx, input_pad, output_grad_transpose, pad_value, + phi::funcs::PadFunction( + dev_ctx, input_pad, output_grad_transpose, pad_value, &transformed_output_grad); } break; case 5: { - math::PadFunction( - ctx, input_pad, output_grad_transpose, pad_value, + phi::funcs::PadFunction( + dev_ctx, input_pad, output_grad_transpose, pad_value, &transformed_output_grad); } break; default: @@ -499,11 +505,14 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { using search1 = SearchAlgorithm; workspace_size = std::max(workspace_size, search1::GetWorkspaceSize(args1)); - data_algo = - search1::Find(args1, false, deterministic, workspace_size, ctx); + data_algo = search1::Find( + args1, false, deterministic, workspace_size, + ctx.template device_context()); #else using search1 = SearchAlgorithm; - data_algo = search1::Find(args1, false, deterministic, ctx); + data_algo = search1::Find( + args1, false, deterministic, + ctx.template device_context()); workspace_size = std::max(workspace_size, search1::GetWorkspaceSize(args1, data_algo)); #endif @@ -521,11 +530,14 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { using search2 = SearchAlgorithm; workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2)); - filter_algo = - search2::Find(args2, false, deterministic, workspace_size, ctx); + filter_algo = search2::Find( + args2, false, deterministic, workspace_size, + ctx.template device_context()); #else using search2 = SearchAlgorithm; - filter_algo = search2::Find(args2, false, deterministic, ctx); + filter_algo = search2::Find( + args2, false, deterministic, + ctx.template device_context()); workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2, filter_algo)); #endif @@ -735,7 +747,7 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { in_data_dims, strides, ksize); int data_dim = strides.size(); // 2d or 3d - bool is_sys_pad = math::IsSymmetricPadding(paddings, data_dim); + bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim); Tensor transformed_X(X->type()); Tensor transformed_ddX(X->type()); @@ -794,26 +806,28 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { T pad_value(0.0); switch (rank) { case 4: { - math::PadFunction( - ctx, input_pad, transformed_X_channel, pad_value, &transformed_X); + phi::funcs::PadFunction( + dev_ctx, input_pad, transformed_X_channel, pad_value, + &transformed_X); if (dO) { - math::PadFunction( - ctx, input_pad, transformed_dO_channel, pad_value, + phi::funcs::PadFunction( + dev_ctx, input_pad, transformed_dO_channel, pad_value, &transformed_dO); } if (ddX) { - math::PadFunction( - ctx, input_pad, transformed_ddX_channel, pad_value, + phi::funcs::PadFunction( + dev_ctx, input_pad, transformed_ddX_channel, pad_value, &transformed_ddX); } } break; case 5: { - math::PadFunction( - ctx, input_pad, 
transformed_X_channel, pad_value, &transformed_X); + phi::funcs::PadFunction( + dev_ctx, input_pad, transformed_X_channel, pad_value, + &transformed_X); if (ddX) { - math::PadFunction( - ctx, input_pad, transformed_ddX_channel, pad_value, + phi::funcs::PadFunction( + dev_ctx, input_pad, transformed_ddX_channel, pad_value, &transformed_ddX); } } break; @@ -940,11 +954,14 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { #ifdef PADDLE_WITH_HIP using search1 = SearchAlgorithm; workspace_size = search1::GetWorkspaceSize(args1); - bwd_algo1 = - search1::Find(args1, false, deterministic, workspace_size, ctx); + bwd_algo1 = search1::Find( + args1, false, deterministic, workspace_size, + ctx.template device_context()); #else using search1 = SearchAlgorithm; - bwd_algo1 = search1::Find(args1, false, deterministic, ctx); + bwd_algo1 = search1::Find( + args1, false, deterministic, + ctx.template device_context()); workspace_size = search1::GetWorkspaceSize(args1, bwd_algo1); #endif } @@ -961,11 +978,14 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { using search2 = SearchAlgorithm; workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2)); - bwd_algo2 = - search2::Find(args2, false, deterministic, workspace_size, ctx); + bwd_algo2 = search2::Find( + args2, false, deterministic, workspace_size, + ctx.template device_context()); #else using search2 = SearchAlgorithm; - bwd_algo2 = search2::Find(args2, false, deterministic, ctx); + bwd_algo2 = search2::Find( + args2, false, deterministic, + ctx.template device_context()); workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2, bwd_algo2)); #endif @@ -986,11 +1006,14 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { using search3 = SearchAlgorithm; workspace_size = std::max(workspace_size, search3::GetWorkspaceSize(args3)); - filter_algo = - search3::Find(args3, false, deterministic, workspace_size, ctx); + filter_algo = search3::Find( + args3, false, deterministic, workspace_size, + ctx.template device_context()); #else using search3 = SearchAlgorithm; - filter_algo = search3::Find(args3, false, deterministic, ctx); + filter_algo = search3::Find( + args3, false, deterministic, + ctx.template device_context()); workspace_size = std::max(workspace_size, search3::GetWorkspaceSize(args3, filter_algo)); #endif @@ -1009,11 +1032,14 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { using search4 = SearchAlgorithm; workspace_size = std::max(workspace_size, search4::GetWorkspaceSize(args4)); - data_algo = - search4::Find(args4, false, deterministic, workspace_size, ctx); + data_algo = search4::Find( + args4, false, deterministic, workspace_size, + ctx.template device_context()); #else using search4 = SearchAlgorithm; - data_algo = search4::Find(args4, false, deterministic, ctx); + data_algo = search4::Find( + args4, false, deterministic, + ctx.template device_context()); workspace_size = std::max(workspace_size, search4::GetWorkspaceSize(args4, data_algo)); #endif diff --git a/paddle/fluid/operators/conv_transpose_op.cu b/paddle/fluid/operators/conv_transpose_op.cu index b2a4910222f1178d23e94eade9580248bb103c88..054cb4b33895b02a816cc2bff82b1c9052bc645d 100644 --- a/paddle/fluid/operators/conv_transpose_op.cu +++ b/paddle/fluid/operators/conv_transpose_op.cu @@ -13,10 +13,150 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/conv_transpose_op.h" +#include "paddle/phi/kernels/gpu/depthwise_conv.h" namespace ops = paddle::operators; using CUDA = paddle::platform::CUDADeviceContext; +namespace paddle { +namespace operators { +using Tensor = framework::Tensor; +using DDim = framework::DDim; + +template +class DepthwiseConvTransposeKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const std::string data_layout_str = + context.Attr("data_format"); + const framework::DataLayout data_layout = + framework::StringToDataLayout(data_layout_str); + const Tensor* input = context.Input("Input"); + Tensor filter = *context.Input("Filter"); + Tensor* output = context.Output("Output"); + output->mutable_data(context.GetPlace()); + + int groups = context.Attr("groups"); + PADDLE_ENFORCE_EQ( + groups, filter.dims()[0], + platform::errors::InvalidArgument( + "groups should be error to the 1st dimension of filter. But " + "received groups is %d and filter dimension[0] is %d", + groups, filter.dims()[0])); + + std::vector strides = context.Attr>("strides"); + std::vector paddings = context.Attr>("paddings"); + std::vector dilations = context.Attr>("dilations"); + std::string padding_algorithm = + context.Attr("padding_algorithm"); + for (auto v : dilations) { + PADDLE_ENFORCE_EQ(v, 1, platform::errors::InvalidArgument( + "dilations should be 1 in depthwise conv. " + "But received dilations is %d", + v)); + } + + auto in_dims = input->dims(); + auto filter_dims = filter.dims(); + + framework::DDim in_data_dims; + if (data_layout != framework::DataLayout::kNHWC) { + in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); + } else { + in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); + } + framework::DDim filter_data_dims = + phi::slice_ddim(filter_dims, 2, filter_dims.size()); + std::vector ksize = phi::vectorize(filter_data_dims); + UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, + in_data_dims, strides, ksize); + + output->mutable_data(context.GetPlace()); + auto& dev_ctx = context.template device_context(); + phi::funcs::SetConstant set_zero; + set_zero(dev_ctx, output, static_cast(0)); + + math::DepthwiseConvInputGradFunctor + depthwiseConvInputGrad; + depthwiseConvInputGrad( + static_cast::TYPE&>(dev_ctx), + *output, filter, *input, strides, + std::vector{paddings[0], paddings[2], paddings[1], paddings[3]}, + dilations, output, data_layout); + } +}; + +template +class DepthwiseConvTransposeGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const std::string data_layout_str = + context.Attr("data_format"); + const framework::DataLayout data_layout = + framework::StringToDataLayout(data_layout_str); + const Tensor* input = context.Input("Input"); + const Tensor* output_grad = + context.Input(framework::GradVarName("Output")); + Tensor* input_grad = + context.Output(framework::GradVarName("Input")); + Tensor* filter_grad = + context.Output(framework::GradVarName("Filter")); + Tensor filter = *context.Input("Filter"); + + if (!input_grad && !filter_grad) return; + + auto& dev_ctx = context.template device_context(); + std::vector strides = context.Attr>("strides"); + std::vector paddings = context.Attr>("paddings"); + std::vector dilations = context.Attr>("dilations"); + std::string padding_algorithm = + context.Attr("padding_algorithm"); + + auto in_dims = input->dims(); + auto filter_dims = filter.dims(); + 
+ framework::DDim in_data_dims; + if (data_layout != framework::DataLayout::kNHWC) { + in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); + } else { + in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); + } + framework::DDim filter_data_dims = + phi::slice_ddim(filter_dims, 2, filter_dims.size()); + std::vector ksize = phi::vectorize(filter_data_dims); + UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, + in_data_dims, strides, ksize); + + if (input_grad) { + math::DepthwiseConvFunctor depthwiseConv; + depthwiseConv( + static_cast::TYPE&>(dev_ctx), + *output_grad, filter, strides, + std::vector{paddings[0], paddings[2], paddings[1], paddings[3]}, + dilations, input_grad, data_layout); + } + + if (filter_grad) { + phi::funcs::SetConstant set_zero; + filter_grad->mutable_data(context.GetPlace()); + set_zero(dev_ctx, filter_grad, static_cast(0)); + + math::DepthwiseConvFilterGradFunctor + depthwiseConvFilterGrad; + depthwiseConvFilterGrad( + static_cast::TYPE&>(dev_ctx), + *output_grad, *input, strides, + std::vector{paddings[0], paddings[2], paddings[1], paddings[3]}, + dilations, filter_grad, data_layout); + } + } +}; + +} // namespace operators +} // namespace paddle // conv2d REGISTER_OP_CUDA_KERNEL(conv2d_transpose, ops::GemmConvTransposeKernel, diff --git a/paddle/fluid/operators/conv_transpose_op.h b/paddle/fluid/operators/conv_transpose_op.h index 76d6ad6bf2ff7361a90fb6f013f989db5a2b8845..ee0fb7ab3683364f6db3cffd7ddef67c61f19433 100644 --- a/paddle/fluid/operators/conv_transpose_op.h +++ b/paddle/fluid/operators/conv_transpose_op.h @@ -21,7 +21,6 @@ limitations under the License. */ #include "paddle/fluid/operators/conv_op.h" #include "paddle/fluid/operators/eigen/eigen_function.h" #include "paddle/fluid/operators/math/concat_and_split.h" -#include "paddle/fluid/operators/math/depthwise_conv.h" #include "paddle/fluid/operators/math/im2col.h" #include "paddle/fluid/operators/math/vol2col.h" #include "paddle/phi/kernels/funcs/blas/blas.h" @@ -578,130 +577,5 @@ class GemmConvTransposeGradKernel : public framework::OpKernel { } }; -template -class DepthwiseConvTransposeKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const std::string data_layout_str = - context.Attr("data_format"); - const framework::DataLayout data_layout = - framework::StringToDataLayout(data_layout_str); - const Tensor* input = context.Input("Input"); - Tensor filter = *context.Input("Filter"); - Tensor* output = context.Output("Output"); - output->mutable_data(context.GetPlace()); - - int groups = context.Attr("groups"); - PADDLE_ENFORCE_EQ( - groups, filter.dims()[0], - platform::errors::InvalidArgument( - "groups should be error to the 1st dimension of filter. But " - "received groups is %d and filter dimension[0] is %d", - groups, filter.dims()[0])); - - std::vector strides = context.Attr>("strides"); - std::vector paddings = context.Attr>("paddings"); - std::vector dilations = context.Attr>("dilations"); - std::string padding_algorithm = - context.Attr("padding_algorithm"); - for (auto v : dilations) { - PADDLE_ENFORCE_EQ(v, 1, platform::errors::InvalidArgument( - "dilations should be 1 in depthwise conv. 
" - "But received dilations is %d", - v)); - } - - auto in_dims = input->dims(); - auto filter_dims = filter.dims(); - - framework::DDim in_data_dims; - if (data_layout != framework::DataLayout::kNHWC) { - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - } else { - in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - } - framework::DDim filter_data_dims = - phi::slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - output->mutable_data(context.GetPlace()); - auto& dev_ctx = context.template device_context(); - phi::funcs::SetConstant set_zero; - set_zero(dev_ctx, output, static_cast(0)); - - math::DepthwiseConvInputGradFunctor - depthwiseConvInputGrad; - depthwiseConvInputGrad( - dev_ctx, *output, filter, *input, strides, - std::vector{paddings[0], paddings[2], paddings[1], paddings[3]}, - dilations, output, data_layout); - } -}; - -template -class DepthwiseConvTransposeGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const std::string data_layout_str = - context.Attr("data_format"); - const framework::DataLayout data_layout = - framework::StringToDataLayout(data_layout_str); - const Tensor* input = context.Input("Input"); - const Tensor* output_grad = - context.Input(framework::GradVarName("Output")); - Tensor* input_grad = - context.Output(framework::GradVarName("Input")); - Tensor* filter_grad = - context.Output(framework::GradVarName("Filter")); - Tensor filter = *context.Input("Filter"); - - if (!input_grad && !filter_grad) return; - - auto& dev_ctx = context.template device_context(); - std::vector strides = context.Attr>("strides"); - std::vector paddings = context.Attr>("paddings"); - std::vector dilations = context.Attr>("dilations"); - std::string padding_algorithm = - context.Attr("padding_algorithm"); - - auto in_dims = input->dims(); - auto filter_dims = filter.dims(); - - framework::DDim in_data_dims; - if (data_layout != framework::DataLayout::kNHWC) { - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - } else { - in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - } - framework::DDim filter_data_dims = - phi::slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - if (input_grad) { - math::DepthwiseConvFunctor depthwiseConv; - depthwiseConv( - dev_ctx, *output_grad, filter, strides, - std::vector{paddings[0], paddings[2], paddings[1], paddings[3]}, - dilations, input_grad, data_layout); - } - - if (filter_grad) { - phi::funcs::SetConstant set_zero; - filter_grad->mutable_data(context.GetPlace()); - set_zero(dev_ctx, filter_grad, static_cast(0)); - - math::DepthwiseConvFilterGradFunctor - depthwiseConvFilterGrad; - depthwiseConvFilterGrad( - dev_ctx, *output_grad, *input, strides, - std::vector{paddings[0], paddings[2], paddings[1], paddings[3]}, - dilations, filter_grad, data_layout); - } - } -}; } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/cross_op.cc b/paddle/fluid/operators/cross_op.cc index fe00ee06603f0ecf2e3fa6ac367303a70702508f..674b75625d1983ba97f3d47ee154beff79c42dad 100644 --- a/paddle/fluid/operators/cross_op.cc +++ b/paddle/fluid/operators/cross_op.cc @@ -109,8 +109,8 @@ class 
CrossGradMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(cross, CrossInferShapeFunctor, - PT_INFER_META(phi::CrossInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(cross, CrossInferShapeFunctor, + PD_INFER_META(phi::CrossInferMeta)); REGISTER_OPERATOR(cross, ops::CrossOp, ops::CrossOpMaker, ops::CrossGradMaker, ops::CrossGradMaker, diff --git a/paddle/fluid/operators/cum_op.h b/paddle/fluid/operators/cum_op.h deleted file mode 100644 index ab3860ecafc3569c13b0b9e5c882df9ddc03e190..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/cum_op.h +++ /dev/null @@ -1,115 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" - -namespace paddle { -namespace operators { - -template -class CumKernel : public framework::OpKernel { - public: - using T = typename Functor::ELEMENT_TYPE; - - void Compute(const framework::ExecutionContext& context) const override { - auto& X = GET_DATA_SAFELY(context.Input("X"), "Input", - "X", "Cum"); - - auto& Out = GET_DATA_SAFELY(context.Output("Out"), - "Output", "Out", "Cum"); - int axis = context.Attr("axis"); - bool exclusive = context.Attr("exclusive"); - bool reverse = context.Attr("reverse"); - auto out_dims = Out.dims(); - - PADDLE_ENFORCE_EQ( - axis < out_dims.size() && axis >= (0 - out_dims.size()), true, - platform::errors::OutOfRange( - "Attr(axis) is out of range, It's expected " - "to be in range of [-%d, %d]. 
But received Attr(axis) = %d.", - out_dims.size(), out_dims.size() - 1, axis)); - if (axis < 0) { - axis += out_dims.size(); - } - - Out.template mutable_data(context.GetPlace()); - - int pre = 1; - int post = 1; - int mid = out_dims[axis]; - for (int i = 0; i < axis; ++i) { - pre *= out_dims[i]; - } - for (int i = axis + 1; i < out_dims.size(); ++i) { - post *= out_dims[i]; - } - - auto x = framework::EigenVector::Flatten(X); - auto out = framework::EigenVector::Flatten(Out); - auto* place = - context.template device_context().eigen_device(); - - using IndexT = Eigen::DenseIndex; - if (pre == 1) { - if (post == 1) { - ComputeImp(*place, Eigen::DSizes(mid), x, out, - /* axis= */ 0, reverse, exclusive); - } else { - ComputeImp(*place, Eigen::DSizes(mid, post), x, out, - /* axis= */ 0, reverse, exclusive); - } - } else { - if (post == 1) { - ComputeImp(*place, Eigen::DSizes(pre, mid), x, out, - /* axis= */ 1, reverse, exclusive); - } else { - ComputeImp(*place, Eigen::DSizes(pre, mid, post), x, out, - /* axis= */ 1, reverse, exclusive); - } - } - } - - private: - template - void ComputeImp(Device d, const Dim& dims, X x, Out out, int axis, - bool reverse, bool exclusive) const { - if (!reverse) { - out.reshape(dims).device(d) = Functor()(x.reshape(dims), axis, exclusive); - } else { - std::array rev; - rev.fill(false); - rev[axis] = reverse; - out.reshape(dims).device(d) = - Functor()(x.reshape(dims).reverse(rev), axis, exclusive).reverse(rev); - } - } -}; - -template -struct CumsumFunctor { - using ELEMENT_TYPE = T; - template - const typename X::TensorScanSumOp operator()(X x, int axis, - bool exclusive) const { - return x.cumsum(axis, exclusive); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/cumsum_op.cc b/paddle/fluid/operators/cumsum_op.cc index 9fa355a924612651556f2a79711cae4ce17379f8..11633fb0b870327f14e4454b3f94a43940a9df53 100644 --- a/paddle/fluid/operators/cumsum_op.cc +++ b/paddle/fluid/operators/cumsum_op.cc @@ -12,8 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/fluid/operators/cum_op.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -21,17 +24,6 @@ namespace operators { class CumOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - if (ctx->Attrs().Get("flatten")) { - ctx->SetOutputDim("Out", - phi::make_ddim({phi::product(ctx->GetInputDim("X"))})); - } else { - ctx->SetOutputDim("Out", ctx->GetInputDim("X")); - } - - ctx->ShareLoD("X", /*->*/ "Out"); - } }; class CumsumOpMaker : public framework::OpProtoAndCheckerMaker { @@ -87,15 +79,12 @@ class CumsumGradMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; using CPU = paddle::platform::CPUDeviceContext; - +DECLARE_INFER_SHAPE_FUNCTOR(cumsum, CumsumInferShapeFunctor, + PD_INFER_META(phi::CumsumInferMeta)); REGISTER_OPERATOR(cumsum, ops::CumOp, ops::CumsumOpMaker, ops::CumsumGradMaker, - ops::CumsumGradMaker); -REGISTER_OP_CPU_KERNEL(cumsum, ops::CumKernel>, - ops::CumKernel>, - ops::CumKernel>, - ops::CumKernel>, - ops::CumKernel>); + ops::CumsumGradMaker, + CumsumInferShapeFunctor); REGISTER_OP_VERSION(cumsum) .AddCheckpoint( diff --git a/paddle/fluid/operators/cumsum_op.cu b/paddle/fluid/operators/cumsum_op.cu deleted file mode 100644 index 3402f42521f54f315390fe2162309fb204fd9b00..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/cumsum_op.cu +++ /dev/null @@ -1,325 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include -#include -#ifdef __NVCC__ -#include -#endif -#ifdef __HIPCC__ -#include -namespace cub = hipcub; -#endif -#include "paddle/fluid/operators/cum_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" - -using Tensor = paddle::framework::Tensor; -using LoDTensor = paddle::framework::LoDTensor; - -namespace paddle { -namespace operators { - -template -__device__ void BlockReverse(const T* idata, T* odata, int src_base, - int dst_base, int valid_item) { - __shared__ T sh_mem[BLOCK_SIZE]; - int tx = threadIdx.x; - - int offset = tx; - int in_index = src_base + offset; - if (offset >= valid_item) { - sh_mem[offset] = 0; - } else { - int sh_mem_index = BLOCK_SIZE - offset - 1; - T data = idata[in_index]; - sh_mem[sh_mem_index] = data; - } - - __syncthreads(); - int out_index = dst_base - offset; - if (offset < valid_item) { - int sh_mem_index = BLOCK_SIZE - offset - 1; - odata[out_index] = sh_mem[sh_mem_index]; - } -} - -template -__global__ void MatrixRowReverse(const T* matrix_data, T* reverse_data, - int reverse_size, int outer_size, - int inner_size) { - int bx = blockIdx.x; - int by = blockIdx.y; - int item_per_block = 1024; - - for (int block_offset = 0; block_offset < reverse_size; - block_offset += item_per_block) { - int valid_item = (reverse_size - block_offset > item_per_block) - ? item_per_block - : reverse_size - block_offset; - int src_offset = - bx * reverse_size + block_offset + by * (inner_size * reverse_size); - int dst_offset = bx * reverse_size + by * (inner_size * reverse_size) + - reverse_size - 1 - block_offset; - if (reverse_size < item_per_block) { - valid_item = reverse_size; - } - - BlockReverse(matrix_data, reverse_data, src_offset, dst_offset, - valid_item); - } -} - -template -struct BlockPrefixCallbackOp { - // Running prefix - T running_total; - // Constructor - __device__ BlockPrefixCallbackOp(T running_total) - : running_total(running_total) {} - // Callback operator to be entered by the first warp of threads in the block. - // Thread-0 is responsible for returning a value for seeding the block-wide - // scan. 
- __device__ T operator()(T block_aggregate) { - T old_prefix = running_total; - running_total = old_prefix + block_aggregate; - return old_prefix; - } -}; - -// No bank-conflict transpose -template -__global__ void MatrixTranspose(T* odata, const T* idata, size_t height, - size_t width) { - __shared__ T tile[TILE_DIM][TILE_DIM + 1]; - - int x = blockIdx.x * TILE_DIM + threadIdx.x; - int y = blockIdx.y * TILE_DIM + threadIdx.y; - for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) { - if (x < width && (y + j) < height) { - tile[threadIdx.y + j][threadIdx.x] = idata[(y + j) * width + x]; - } else { - tile[threadIdx.y + j][threadIdx.x] = 0; - } - } - - __syncthreads(); - - x = blockIdx.y * TILE_DIM + threadIdx.x; // transpose block offset - y = blockIdx.x * TILE_DIM + threadIdx.y; - - for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) { - if (x < height && (y + j) < width) { - odata[(y + j) * height + x] = tile[threadIdx.x][threadIdx.y + j]; - } - } -} - -template -__global__ void BlockScanKernel(T* d_out, const T* d_in, int inner_size, - int outer_size, int scan_size, bool exclusive) { - // Specialize BlockLoad, BlockStore, and BlockRadixSort collective types - typedef cub::BlockLoad - BlockLoadT; - typedef cub::BlockStore - BlockStoreT; - typedef cub::BlockScan BlockScanT; - // Allocate type-safe, repurposable shared memory for collectives - __shared__ union { - typename BlockLoadT::TempStorage load; - typename BlockStoreT::TempStorage store; - typename BlockScanT::TempStorage scan; - } temp_storage; - - int bx = blockIdx.x; - int by = blockIdx.y; - - BlockPrefixCallbackOp prefix_op(0); - T block_aggregate = static_cast(0); - - // Obtain this block's segment of consecutive keys (blocked across threads) - int item_per_block = BLOCK_THREADS * ITEMS_PER_THREAD; - for (int block_offset = 0; block_offset < scan_size; - block_offset += BLOCK_THREADS * ITEMS_PER_THREAD) { - int valid_item = (scan_size - block_offset > item_per_block) - ? item_per_block - : (scan_size - block_offset); - if (scan_size < item_per_block) { - valid_item = scan_size; - } - - int offset = bx * scan_size + block_offset + by * (inner_size * scan_size); - - T thread_keys[ITEMS_PER_THREAD]; - BlockLoadT(temp_storage.load) - .Load(d_in + offset, thread_keys, valid_item, 0); - - __syncthreads(); - if (exclusive) { - T init_value = static_cast(0); - BlockScanT(temp_storage.scan) - .ExclusiveScan(thread_keys, thread_keys, cub::Sum(), prefix_op); - } else { - BlockScanT(temp_storage.scan) - .InclusiveScan(thread_keys, thread_keys, cub::Sum(), prefix_op); - } - __syncthreads(); - - BlockStoreT(temp_storage.store) - .Store(d_out + offset, thread_keys, valid_item); - } -} - -template -class CumCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in = context.Input("X"); - auto* out = context.Output("Out"); - - int axis = context.Attr("axis"); - bool exclusive = context.Attr("exclusive"); - bool reverse = context.Attr("reverse"); - auto out_dims = out->dims(); - auto size = in->numel(); - - PADDLE_ENFORCE_EQ( - axis < out_dims.size() && axis >= (0 - out_dims.size()), true, - platform::errors::OutOfRange( - "Attr(axis) is out of range, It's expected " - "to be in range of [-%d, %d]. 
But received Attr(axis) = %d.", - out_dims.size(), out_dims.size() - 1, axis)); - if (axis < 0) { - axis += out_dims.size(); - } - - T* out_data = out->mutable_data(context.GetPlace()); - const T* in_data = in->data(); - - // Use thrust for parallel acceleration when the input size is equal to the - // length of the ‘axis’ dimension. - if (size == out_dims[axis]) { - if (reverse) { - thrust::device_ptr dev_ptr = - thrust::device_pointer_cast(in_data); - thrust::device_vector vec(dev_ptr, dev_ptr + size); - if (exclusive) { - thrust::exclusive_scan(thrust::device, vec.rbegin(), vec.rend(), - out_data); - } else { - thrust::inclusive_scan(thrust::device, vec.rbegin(), vec.rend(), - out_data); - } - thrust::reverse(thrust::device, out_data, out_data + size); - } else { - if (exclusive) { - thrust::exclusive_scan(thrust::device, in_data, in_data + size, - out_data); - } else { - thrust::inclusive_scan(thrust::device, in_data, in_data + size, - out_data); - } - } - return; - } - - size_t height = 1; - size_t width = 1; - for (size_t i = 0; i <= axis; i++) { - height *= out_dims[i]; - } - - for (size_t i = axis + 1; i < out_dims.size(); i++) { - width *= out_dims[i]; - } - int scan_size = out_dims[axis]; - bool transpose = (axis != out_dims.size() - 1); - - int tile_size = 32; - dim3 blocks(32, 8); - dim3 transpose_grids((width + tile_size - 1) / tile_size, - (height + tile_size - 1) / tile_size); - auto& dev_ctx = context.template device_context(); - framework::Tensor tmp; - tmp.Resize(out_dims); - auto* tmp_data = tmp.mutable_data(context.GetPlace()); - T* next_in_data = out_data; - T* next_out_data = tmp_data; - if (transpose) { - MatrixTranspose<<>>( - out_data, in_data, height, width); - next_in_data = out_data; - next_out_data = tmp_data; - } - auto swap_ptr = [](T*& ptr1, T*& ptr2) { - T* tmp = ptr2; - ptr2 = ptr1; - ptr1 = tmp; - }; - int outer_size = height / scan_size; - int inner_size = width; - // Consider the size of shared memory, here block size is 128 - dim3 scan_grid(outer_size, inner_size); - dim3 reverse_grid = scan_grid; - if (reverse) { - if (transpose) { - reverse_grid.x = scan_grid.y; - reverse_grid.y = scan_grid.x; - MatrixRowReverse<<>>( - next_in_data, next_out_data, scan_size, outer_size, inner_size); - if (!transpose) next_in_data = tmp_data; - swap_ptr(next_in_data, next_out_data); - } else { - MatrixRowReverse<<>>( - in_data, out_data, scan_size, outer_size, inner_size); - } - } - if (!transpose && !reverse) { - BlockScanKernel<<>>( - out_data, in_data, outer_size, inner_size, scan_size, exclusive); - - } else { - BlockScanKernel<<>>( - next_out_data, next_in_data, outer_size, inner_size, scan_size, - exclusive); - } - swap_ptr(next_in_data, next_out_data); - if (reverse) { - MatrixRowReverse<<>>( - next_in_data, next_out_data, scan_size, outer_size, inner_size); - swap_ptr(next_in_data, next_out_data); - } - if (transpose) { - transpose_grids.x = (height + tile_size - 1) / tile_size; - transpose_grids.y = (width + tile_size - 1) / tile_size; - MatrixTranspose<<>>( - next_out_data, next_in_data, width, height); - } - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - cumsum, ops::CumCUDAKernel, - ops::CumCUDAKernel, - ops::CumCUDAKernel, - ops::CumCUDAKernel, - ops::CumCUDAKernel); diff --git a/paddle/fluid/operators/cumsum_op_npu.cc b/paddle/fluid/operators/cumsum_op_npu.cc index 38bf53ca0aa1a2dddca4ac2d2043de10fcdb7830..d197e4362e96976661ab891929b4503977f52ff0 100644 --- 
a/paddle/fluid/operators/cumsum_op_npu.cc +++ b/paddle/fluid/operators/cumsum_op_npu.cc @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/cum_op.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt index 1ebafa54598574ae9027a4887639a2a1d27448ea..568c7982cfc7c07b9c7f840ccaa32e4025225122 100644 --- a/paddle/fluid/operators/detection/CMakeLists.txt +++ b/paddle/fluid/operators/detection/CMakeLists.txt @@ -62,7 +62,7 @@ detection_library(locality_aware_nms_op SRCS locality_aware_nms_op.cc DEPS gpc) detection_library(matrix_nms_op SRCS matrix_nms_op.cc DEPS gpc) detection_library(box_clip_op SRCS box_clip_op.cc box_clip_op.cu) detection_library(yolov3_loss_op SRCS yolov3_loss_op.cc) -detection_library(yolo_box_op SRCS yolo_box_op.cc yolo_box_op.cu) +detection_library(yolo_box_op SRCS yolo_box_op.cc) detection_library(box_decoder_and_assign_op SRCS box_decoder_and_assign_op.cc box_decoder_and_assign_op.cu) detection_library(sigmoid_focal_loss_op SRCS sigmoid_focal_loss_op.cc sigmoid_focal_loss_op.cu) detection_library(retinanet_detection_output_op SRCS retinanet_detection_output_op.cc) diff --git a/paddle/fluid/operators/detection/bbox_util.cu.h b/paddle/fluid/operators/detection/bbox_util.cu.h index b361bc3ab75e8ad84bbf2a353230a90e01b99b74..f170fbbe4b534ed5f6bb97508048a72ac766de90 100644 --- a/paddle/fluid/operators/detection/bbox_util.cu.h +++ b/paddle/fluid/operators/detection/bbox_util.cu.h @@ -23,7 +23,6 @@ limitations under the License. 
*/ #include namespace cub = hipcub; #endif -#include "paddle/fluid/operators/gather.cu.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/fluid/platform/for_range.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu index ce9ac3de4e78c2aa562718719b111c9c47376bc8..860fdd01794ccc9898332f6f0d0ba4e9c3e296d6 100644 --- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu +++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu @@ -23,11 +23,11 @@ namespace cub = hipcub; #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/operators/detection/bbox_util.h" #include "paddle/fluid/operators/detection/collect_fpn_proposals_op.h" -#include "paddle/fluid/operators/gather.cu.h" #include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/fluid/operators/strided_memcpy.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/phi/kernels/funcs/gather.cu.h" namespace paddle { namespace operators { @@ -160,9 +160,9 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel { sorted_rois.mutable_data({real_post_num, kBBoxSize}, dev_ctx.GetPlace()); Tensor sorted_batch_id; sorted_batch_id.mutable_data({real_post_num}, dev_ctx.GetPlace()); - GPUGather(dev_ctx, concat_rois, index_out_t, &sorted_rois); - GPUGather(dev_ctx, roi_batch_id_list_gpu, index_out_t, - &sorted_batch_id); + phi::funcs::GPUGather(dev_ctx, concat_rois, index_out_t, &sorted_rois); + phi::funcs::GPUGather(dev_ctx, roi_batch_id_list_gpu, index_out_t, + &sorted_batch_id); Tensor batch_index_t; int* batch_idx_in = @@ -190,7 +190,7 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel { out_id_data, batch_idx_in, index_out_t.data(), real_post_num, 0, sizeof(int) * 8, dev_ctx.stream()); - GPUGather(dev_ctx, sorted_rois, index_out_t, fpn_rois); + phi::funcs::GPUGather(dev_ctx, sorted_rois, index_out_t, fpn_rois); Tensor length_lod; int* length_lod_data = diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.h b/paddle/fluid/operators/detection/collect_fpn_proposals_op.h index a60f881ebf3e3bd825219dce1fb9f377d90c7a94..e5ae9a6ccbda5acbdb37d1190314c94ca4007c07 100644 --- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.h +++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.h @@ -21,7 +21,6 @@ limitations under the License.*/ #include #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/gather.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { @@ -66,7 +65,8 @@ class CollectFpnProposalsOpKernel : public framework::OpKernel { auto multi_layer_scores = context.MultiInput("MultiLevelScores"); - auto multi_rois_num = context.MultiInput("MultiLevelRoIsNum"); + auto multi_rois_num = + context.MultiInput("MultiLevelRoIsNum"); int num_size = multi_rois_num.size(); auto* fpn_rois = context.Output("FpnRois"); @@ -176,7 +176,7 @@ class CollectFpnProposalsOpKernel : public framework::OpKernel { } num_per_batch.emplace_back(post_nms_topN - pre_idx); if (context.HasOutput("RoisNum")) { - auto* rois_num = context.Output("RoisNum"); + auto* rois_num = context.Output("RoisNum"); int* rois_num_data = rois_num->mutable_data({batch_size}, context.GetPlace()); for (int i = 0; i < batch_size; i++) { diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu 
b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu index c117fbd70f52827a724c07213cd020d1b58cce22..7ad25e003b491294287a62433b8bf494086a87c2 100644 --- a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu +++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu @@ -24,9 +24,9 @@ namespace cub = hipcub; #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/operators/detection/bbox_util.h" #include "paddle/fluid/operators/detection/distribute_fpn_proposals_op.h" -#include "paddle/fluid/operators/gather.cu.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/phi/kernels/funcs/gather.cu.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { @@ -193,7 +193,8 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel { start = end; multi_fpn_rois[i]->mutable_data({sub_rois_num, kBoxDim}, dev_ctx.GetPlace()); - GPUGather(dev_ctx, *fpn_rois, sub_idx, multi_fpn_rois[i]); + phi::funcs::GPUGather(dev_ctx, *fpn_rois, sub_idx, + multi_fpn_rois[i]); } else { multi_fpn_rois[i]->mutable_data({sub_rois_num, kBoxDim}, dev_ctx.GetPlace()); diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h index 628cbcd761186bd060fdcbd2b68fe8defec1bf17..5479e08c2a5efa96e64eca45d75af7a6a60a8862 100644 --- a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h +++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h @@ -20,7 +20,6 @@ limitations under the License. */ #include #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/gather.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { @@ -28,10 +27,11 @@ namespace operators { const int kBoxDim = 4; -inline std::vector GetLodFromRoisNum(const Tensor* rois_num) { +inline std::vector GetLodFromRoisNum( + const framework::Tensor* rois_num) { std::vector rois_lod; auto* rois_num_data = rois_num->data(); - Tensor cpu_tensor; + framework::Tensor cpu_tensor; if (platform::is_gpu_place(rois_num->place())) { paddle::framework::TensorCopySync(*rois_num, platform::CPUPlace(), &cpu_tensor); @@ -93,7 +93,7 @@ class DistributeFpnProposalsOpKernel : public framework::OpKernel { std::vector fpn_rois_lod; int fpn_rois_num; if (context.HasInput("RoisNum")) { - auto* rois_num = context.Input("RoisNum"); + auto* rois_num = context.Input("RoisNum"); fpn_rois_lod = GetLodFromRoisNum(rois_num); } else { fpn_rois_lod = fpn_rois->lod().back(); @@ -105,7 +105,7 @@ class DistributeFpnProposalsOpKernel : public framework::OpKernel { std::vector num_rois_level(num_level, 0); std::vector num_rois_level_integral(num_level + 1, 0); for (size_t i = 0; i < fpn_rois_lod.size() - 1; ++i) { - Tensor fpn_rois_slice = + auto fpn_rois_slice = fpn_rois->Slice(fpn_rois_lod[i], fpn_rois_lod[i + 1]); const T* rois_data = fpn_rois_slice.data(); for (int j = 0; j < fpn_rois_slice.dims()[0]; ++j) { @@ -140,7 +140,7 @@ class DistributeFpnProposalsOpKernel : public framework::OpKernel { std::vector restore_index_inter(fpn_rois_num, -1); // distribute the rois into different fpn level by target level for (size_t i = 0; i < fpn_rois_lod.size() - 1; ++i) { - Tensor fpn_rois_slice = + auto fpn_rois_slice = fpn_rois->Slice(fpn_rois_lod[i], fpn_rois_lod[i + 1]); const T* rois_data = fpn_rois_slice.data(); size_t cur_offset = fpn_rois_lod[i]; @@ -163,7 +163,8 @@ class DistributeFpnProposalsOpKernel : public 
framework::OpKernel { for (int i = 0; i < fpn_rois_num; ++i) { restore_index_data[restore_index_inter[i]] = i; } - auto multi_rois_num = context.MultiOutput("MultiLevelRoIsNum"); + auto multi_rois_num = + context.MultiOutput("MultiLevelRoIsNum"); if (multi_rois_num.size() > 0) { int batch_size = fpn_rois_lod.size() - 1; for (int i = 0; i < num_level; ++i) { diff --git a/paddle/fluid/operators/detection/generate_mask_labels_op.cc b/paddle/fluid/operators/detection/generate_mask_labels_op.cc index e6af1a5bbf71cf24cd355dc09cb439e0bc9fbfba..c9cc4e722071c69f0bf658ad69363dbdd75b63e4 100644 --- a/paddle/fluid/operators/detection/generate_mask_labels_op.cc +++ b/paddle/fluid/operators/detection/generate_mask_labels_op.cc @@ -17,7 +17,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/detection/bbox_util.h" #include "paddle/fluid/operators/detection/mask_util.h" -#include "paddle/fluid/operators/gather.h" #include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc index 424aa0714400d3c8a897f98b9209222aa61acef8..cbf17048400bfd967e311897bf8d6d6e11d6000b 100644 --- a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc +++ b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc @@ -16,8 +16,8 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/detection/bbox_util.h" -#include "paddle/fluid/operators/gather.h" #include "paddle/fluid/operators/math/concat_and_split.h" +#include "paddle/phi/kernels/funcs/gather.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { @@ -281,22 +281,22 @@ void GatherBoxesLabels(const platform::CPUDeviceContext& context, Tensor fg_boxes, bg_boxes, fg_labels, bg_labels; fg_boxes.mutable_data({fg_num, kBoxDim}, context.GetPlace()); - CPUGather(context, boxes, fg_inds_t, &fg_boxes); + phi::funcs::CPUGather(context, boxes, fg_inds_t, &fg_boxes); bg_boxes.mutable_data({bg_num, kBoxDim}, context.GetPlace()); - CPUGather(context, boxes, bg_inds_t, &bg_boxes); + phi::funcs::CPUGather(context, boxes, bg_inds_t, &bg_boxes); Concat(context, fg_boxes, bg_boxes, sampled_boxes); - CPUGather(context, gt_boxes, gt_box_inds_t, sampled_gts); + phi::funcs::CPUGather(context, gt_boxes, gt_box_inds_t, sampled_gts); fg_labels.mutable_data({fg_num}, context.GetPlace()); - CPUGather(context, gt_classes, gt_label_inds_t, &fg_labels); + phi::funcs::CPUGather(context, gt_classes, gt_label_inds_t, &fg_labels); bg_labels.mutable_data({bg_num}, context.GetPlace()); phi::funcs::set_constant(context, &bg_labels, 0); Concat(context, fg_labels, bg_labels, sampled_labels); Tensor fg_max_overlap, bg_max_overlap; fg_max_overlap.mutable_data({fg_num}, context.GetPlace()); - CPUGather(context, max_overlap, fg_inds_t, &fg_max_overlap); + phi::funcs::CPUGather(context, max_overlap, fg_inds_t, &fg_max_overlap); bg_max_overlap.mutable_data({bg_num}, context.GetPlace()); - CPUGather(context, max_overlap, bg_inds_t, &bg_max_overlap); + phi::funcs::CPUGather(context, max_overlap, bg_inds_t, &bg_max_overlap); Concat(context, fg_max_overlap, bg_max_overlap, sampled_max_overlap); } @@ -334,7 +334,7 @@ std::vector SampleRoisForOneImage( } else { proposals_num = keep.numel(); 
roi_filter.mutable_data({proposals_num, kBoxDim}, context.GetPlace()); - CPUGather(context, rpn_rois, keep, &roi_filter); + phi::funcs::CPUGather(context, rpn_rois, keep, &roi_filter); } T* roi_filter_dt = roi_filter.data(); memcpy(rpn_rois_dt, roi_filter_dt, roi_filter.numel() * sizeof(T)); diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cc b/paddle/fluid/operators/detection/generate_proposals_op.cc index 8c4bd4ac61320356073107b7a109e3c27d6b41a1..d6130823271f05c83e590d28b41c3baf73e054f0 100644 --- a/paddle/fluid/operators/detection/generate_proposals_op.cc +++ b/paddle/fluid/operators/detection/generate_proposals_op.cc @@ -20,7 +20,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/detection/bbox_util.h" #include "paddle/fluid/operators/detection/nms_util.h" -#include "paddle/fluid/operators/gather.h" +#include "paddle/phi/kernels/funcs/gather.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { @@ -196,10 +196,10 @@ class GenerateProposalsKernel : public framework::OpKernel { anchor_sel.mutable_data({index_t.numel(), 4}, ctx.GetPlace()); var_sel.mutable_data({index_t.numel(), 4}, ctx.GetPlace()); - CPUGather(ctx, scores_slice, index_t, &scores_sel); - CPUGather(ctx, bbox_deltas_slice, index_t, &bbox_sel); - CPUGather(ctx, anchors, index_t, &anchor_sel); - CPUGather(ctx, variances, index_t, &var_sel); + phi::funcs::CPUGather(ctx, scores_slice, index_t, &scores_sel); + phi::funcs::CPUGather(ctx, bbox_deltas_slice, index_t, &bbox_sel); + phi::funcs::CPUGather(ctx, anchors, index_t, &anchor_sel); + phi::funcs::CPUGather(ctx, variances, index_t, &var_sel); Tensor proposals; proposals.mutable_data({index_t.numel(), 4}, ctx.GetPlace()); @@ -223,8 +223,8 @@ class GenerateProposalsKernel : public framework::OpKernel { Tensor scores_filter; bbox_sel.mutable_data({keep.numel(), 4}, ctx.GetPlace()); scores_filter.mutable_data({keep.numel(), 1}, ctx.GetPlace()); - CPUGather(ctx, proposals, keep, &bbox_sel); - CPUGather(ctx, scores_sel, keep, &scores_filter); + phi::funcs::CPUGather(ctx, proposals, keep, &bbox_sel); + phi::funcs::CPUGather(ctx, scores_sel, keep, &scores_filter); if (nms_thresh <= 0) { return std::make_pair(bbox_sel, scores_filter); } @@ -237,8 +237,8 @@ class GenerateProposalsKernel : public framework::OpKernel { proposals.mutable_data({keep_nms.numel(), 4}, ctx.GetPlace()); scores_sel.mutable_data({keep_nms.numel(), 1}, ctx.GetPlace()); - CPUGather(ctx, bbox_sel, keep_nms, &proposals); - CPUGather(ctx, scores_filter, keep_nms, &scores_sel); + phi::funcs::CPUGather(ctx, bbox_sel, keep_nms, &proposals); + phi::funcs::CPUGather(ctx, scores_filter, keep_nms, &scores_sel); return std::make_pair(proposals, scores_sel); } diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cu b/paddle/fluid/operators/detection/generate_proposals_op.cu index 6e3c322c1748353d4f447dd6a927e13c4d04025c..5fb7973fd89e49f1cc19458059bffe0dadb9aa3e 100644 --- a/paddle/fluid/operators/detection/generate_proposals_op.cu +++ b/paddle/fluid/operators/detection/generate_proposals_op.cu @@ -20,6 +20,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/operators/detection/bbox_util.cu.h" +#include "paddle/phi/kernels/funcs/gather.cu.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { @@ -85,8 +86,8 @@ static std::pair ProposalForOneImage( } proposals_filter.mutable_data({keep_num, 4}, ctx.GetPlace()); scores_filter.mutable_data({keep_num, 1}, ctx.GetPlace()); - GPUGather(ctx, proposals, keep_index, &proposals_filter); - GPUGather(ctx, scores_sort, keep_index, &scores_filter); + phi::funcs::GPUGather(ctx, proposals, keep_index, &proposals_filter); + phi::funcs::GPUGather(ctx, scores_sort, keep_index, &scores_filter); if (nms_thresh <= 0) { return std::make_pair(proposals_filter, scores_filter); @@ -102,8 +103,8 @@ static std::pair ProposalForOneImage( Tensor scores_nms, proposals_nms; proposals_nms.mutable_data({keep_nms.numel(), 4}, ctx.GetPlace()); scores_nms.mutable_data({keep_nms.numel(), 1}, ctx.GetPlace()); - GPUGather(ctx, proposals_filter, keep_nms, &proposals_nms); - GPUGather(ctx, scores_filter, keep_nms, &scores_nms); + phi::funcs::GPUGather(ctx, proposals_filter, keep_nms, &proposals_nms); + phi::funcs::GPUGather(ctx, scores_filter, keep_nms, &scores_nms); return std::make_pair(proposals_nms, scores_nms); } diff --git a/paddle/fluid/operators/detection/generate_proposals_v2_op.cc b/paddle/fluid/operators/detection/generate_proposals_v2_op.cc index 6351ea865cd0eb3891f2b4882a587b2feeb6c67a..1f1802574c5b82281b0a7ecc79d9057df61c37e6 100644 --- a/paddle/fluid/operators/detection/generate_proposals_v2_op.cc +++ b/paddle/fluid/operators/detection/generate_proposals_v2_op.cc @@ -20,7 +20,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/detection/bbox_util.h" #include "paddle/fluid/operators/detection/nms_util.h" -#include "paddle/fluid/operators/gather.h" +#include "paddle/phi/kernels/funcs/gather.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { @@ -197,10 +197,10 @@ class GenerateProposalsV2Kernel : public framework::OpKernel { anchor_sel.mutable_data({index_t.numel(), 4}, ctx.GetPlace()); var_sel.mutable_data({index_t.numel(), 4}, ctx.GetPlace()); - CPUGather(ctx, scores_slice, index_t, &scores_sel); - CPUGather(ctx, bbox_deltas_slice, index_t, &bbox_sel); - CPUGather(ctx, anchors, index_t, &anchor_sel); - CPUGather(ctx, variances, index_t, &var_sel); + phi::funcs::CPUGather(ctx, scores_slice, index_t, &scores_sel); + phi::funcs::CPUGather(ctx, bbox_deltas_slice, index_t, &bbox_sel); + phi::funcs::CPUGather(ctx, anchors, index_t, &anchor_sel); + phi::funcs::CPUGather(ctx, variances, index_t, &var_sel); Tensor proposals; proposals.mutable_data({index_t.numel(), 4}, ctx.GetPlace()); @@ -227,8 +227,8 @@ class GenerateProposalsV2Kernel : public framework::OpKernel { Tensor scores_filter; bbox_sel.mutable_data({keep.numel(), 4}, ctx.GetPlace()); scores_filter.mutable_data({keep.numel(), 1}, ctx.GetPlace()); - CPUGather(ctx, proposals, keep, &bbox_sel); - CPUGather(ctx, scores_sel, keep, &scores_filter); + phi::funcs::CPUGather(ctx, proposals, keep, &bbox_sel); + phi::funcs::CPUGather(ctx, scores_sel, keep, &scores_filter); if (nms_thresh <= 0) { return std::make_pair(bbox_sel, scores_filter); } @@ -242,8 +242,8 @@ class GenerateProposalsV2Kernel : public framework::OpKernel { proposals.mutable_data({keep_nms.numel(), 4}, ctx.GetPlace()); scores_sel.mutable_data({keep_nms.numel(), 1}, ctx.GetPlace()); - 
CPUGather(ctx, bbox_sel, keep_nms, &proposals); - CPUGather(ctx, scores_filter, keep_nms, &scores_sel); + phi::funcs::CPUGather(ctx, bbox_sel, keep_nms, &proposals); + phi::funcs::CPUGather(ctx, scores_filter, keep_nms, &scores_sel); return std::make_pair(proposals, scores_sel); } diff --git a/paddle/fluid/operators/detection/generate_proposals_v2_op.cu b/paddle/fluid/operators/detection/generate_proposals_v2_op.cu index 93ba3deca5fc4f1b0247f90f21936faaaf9c0b43..005309e8ee577119fd295126c40b46a11a762497 100644 --- a/paddle/fluid/operators/detection/generate_proposals_v2_op.cu +++ b/paddle/fluid/operators/detection/generate_proposals_v2_op.cu @@ -20,6 +20,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/operators/detection/bbox_util.cu.h" +#include "paddle/phi/kernels/funcs/gather.cu.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { @@ -86,8 +87,8 @@ static std::pair ProposalForOneImage( } proposals_filter.mutable_data({keep_num, 4}, ctx.GetPlace()); scores_filter.mutable_data({keep_num, 1}, ctx.GetPlace()); - GPUGather(ctx, proposals, keep_index, &proposals_filter); - GPUGather(ctx, scores_sort, keep_index, &scores_filter); + phi::funcs::GPUGather(ctx, proposals, keep_index, &proposals_filter); + phi::funcs::GPUGather(ctx, scores_sort, keep_index, &scores_filter); if (nms_thresh <= 0) { return std::make_pair(proposals_filter, scores_filter); @@ -104,8 +105,8 @@ static std::pair ProposalForOneImage( Tensor scores_nms, proposals_nms; proposals_nms.mutable_data({keep_nms.numel(), 4}, ctx.GetPlace()); scores_nms.mutable_data({keep_nms.numel(), 1}, ctx.GetPlace()); - GPUGather(ctx, proposals_filter, keep_nms, &proposals_nms); - GPUGather(ctx, scores_filter, keep_nms, &scores_nms); + phi::funcs::GPUGather(ctx, proposals_filter, keep_nms, &proposals_nms); + phi::funcs::GPUGather(ctx, scores_filter, keep_nms, &scores_nms); return std::make_pair(proposals_nms, scores_nms); } diff --git a/paddle/fluid/operators/detection/multiclass_nms_op.cc b/paddle/fluid/operators/detection/multiclass_nms_op.cc index 7927410ef37862499aadf61d6e04c45af157f347..83cf6e5fd30f6bcad4870d1ebd18a50e21518dfe 100644 --- a/paddle/fluid/operators/detection/multiclass_nms_op.cc +++ b/paddle/fluid/operators/detection/multiclass_nms_op.cc @@ -93,7 +93,7 @@ class MultiClassNMSOp : public framework::OperatorWithKernel { // Here the box_dims[0] is not the real dimension of output. // It will be rewritten in the computing kernel. 
if (score_size == 3) { - ctx->SetOutputDim("Out", {box_dims[1], box_dims[2] + 2}); + ctx->SetOutputDim("Out", {-1, box_dims[2] + 2}); } else { ctx->SetOutputDim("Out", {-1, box_dims[2] + 2}); } @@ -545,11 +545,10 @@ class MultiClassNMS2Op : public MultiClassNMSOp { void InferShape(framework::InferShapeContext* ctx) const override { MultiClassNMSOp::InferShape(ctx); - auto box_dims = ctx->GetInputDim("BBoxes"); auto score_dims = ctx->GetInputDim("Scores"); auto score_size = score_dims.size(); if (score_size == 3) { - ctx->SetOutputDim("Index", {box_dims[1], 1}); + ctx->SetOutputDim("Index", {-1, 1}); } else { ctx->SetOutputDim("Index", {-1, 1}); } diff --git a/paddle/fluid/operators/detection/yolo_box_op.cc b/paddle/fluid/operators/detection/yolo_box_op.cc index 48b0d511d902ce96e39c392cab661e19fa31f875..0d9fbf612f73c428fb8050fcfcc319ddafabe482 100644 --- a/paddle/fluid/operators/detection/yolo_box_op.cc +++ b/paddle/fluid/operators/detection/yolo_box_op.cc @@ -9,7 +9,6 @@ See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/detection/yolo_box_op.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" @@ -102,7 +101,12 @@ class YoloBoxOp : public framework::OperatorWithKernel { "But received class_num (%s)", class_num)); - int box_num = dim_x[2] * dim_x[3] * anchor_num; + int box_num; + if ((dim_x[2] > 0 && dim_x[3] > 0) || ctx->IsRuntime()) { + box_num = dim_x[2] * dim_x[3] * anchor_num; + } else { + box_num = -1; + } std::vector dim_boxes({dim_x[0], box_num, 4}); ctx->SetOutputDim("Boxes", phi::make_ddim(dim_boxes)); @@ -235,8 +239,6 @@ REGISTER_OPERATOR( yolo_box, ops::YoloBoxOp, ops::YoloBoxOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL(yolo_box, ops::YoloBoxKernel, - ops::YoloBoxKernel); REGISTER_OP_VERSION(yolo_box) .AddCheckpoint( diff --git a/paddle/fluid/operators/detection/yolo_box_op.cu b/paddle/fluid/operators/detection/yolo_box_op.cu deleted file mode 100644 index fb5c214a59e1274ffc30226bf49a068df960f414..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/detection/yolo_box_op.cu +++ /dev/null @@ -1,143 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/memory/malloc.h" -#include "paddle/fluid/operators/detection/yolo_box_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -#include "paddle/phi/kernels/funcs/math_function.h" -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -__global__ void KeYoloBoxFw(const T* input, const int* imgsize, T* boxes, - T* scores, const float conf_thresh, - const int* anchors, const int n, const int h, - const int w, const int an_num, const int class_num, - const int box_num, int input_size_h, - int input_size_w, bool clip_bbox, const float scale, - const float bias, bool iou_aware, - const float iou_aware_factor) { - int tid = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - T box[4]; - for (; tid < n * box_num; tid += stride) { - int grid_num = h * w; - int i = tid / box_num; - int j = (tid % box_num) / grid_num; - int k = (tid % grid_num) / w; - int l = tid % w; - - int an_stride = (5 + class_num) * grid_num; - int img_height = imgsize[2 * i]; - int img_width = imgsize[2 * i + 1]; - - int obj_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 4, - iou_aware); - T conf = sigmoid(input[obj_idx]); - if (iou_aware) { - int iou_idx = GetIoUIndex(i, j, k * w + l, an_num, an_stride, grid_num); - T iou = sigmoid(input[iou_idx]); - conf = pow(conf, static_cast(1. - iou_aware_factor)) * - pow(iou, static_cast(iou_aware_factor)); - } - if (conf < conf_thresh) { - continue; - } - - int box_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 0, - iou_aware); - GetYoloBox(box, input, anchors, l, k, j, h, w, input_size_h, - input_size_w, box_idx, grid_num, img_height, img_width, scale, - bias); - box_idx = (i * box_num + j * grid_num + k * w + l) * 4; - CalcDetectionBox(boxes, box, box_idx, img_height, img_width, clip_bbox); - - int label_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, - 5, iou_aware); - int score_idx = (i * box_num + j * grid_num + k * w + l) * class_num; - CalcLabelScore(scores, input, label_idx, score_idx, class_num, conf, - grid_num); - } -} - -template -class YoloBoxOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); - auto* img_size = ctx.Input("ImgSize"); - auto* boxes = ctx.Output("Boxes"); - auto* scores = ctx.Output("Scores"); - - auto anchors = ctx.Attr>("anchors"); - int class_num = ctx.Attr("class_num"); - float conf_thresh = ctx.Attr("conf_thresh"); - int downsample_ratio = ctx.Attr("downsample_ratio"); - bool clip_bbox = ctx.Attr("clip_bbox"); - bool iou_aware = ctx.Attr("iou_aware"); - float iou_aware_factor = ctx.Attr("iou_aware_factor"); - float scale = ctx.Attr("scale_x_y"); - float bias = -0.5 * (scale - 1.); - - const int n = input->dims()[0]; - const int h = input->dims()[2]; - const int w = input->dims()[3]; - const int box_num = boxes->dims()[1]; - const int an_num = anchors.size() / 2; - int input_size_h = downsample_ratio * h; - int input_size_w = downsample_ratio * w; - - auto& dev_ctx = ctx.cuda_device_context(); - int bytes = sizeof(int) * anchors.size(); - auto anchors_ptr = memory::Alloc(dev_ctx, sizeof(int) * anchors.size()); - int* anchors_data = reinterpret_cast(anchors_ptr->ptr()); - const auto gplace = ctx.GetPlace(); - const auto cplace = platform::CPUPlace(); - memory::Copy(gplace, anchors_data, cplace, anchors.data(), bytes, - dev_ctx.stream()); - - const T* input_data = input->data(); - const 
int* imgsize_data = img_size->data(); - T* boxes_data = boxes->mutable_data({n, box_num, 4}, ctx.GetPlace()); - T* scores_data = - scores->mutable_data({n, box_num, class_num}, ctx.GetPlace()); - phi::funcs::SetConstant set_zero; - set_zero(dev_ctx, boxes, static_cast(0)); - set_zero(dev_ctx, scores, static_cast(0)); - platform::GpuLaunchConfig config = - platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), n * box_num); - - dim3 thread_num = config.thread_per_block; -#ifdef WITH_NV_JETSON - if (config.compute_capability == 53 || config.compute_capability == 62) { - thread_num = 512; - } -#endif - - KeYoloBoxFw<<>>( - input_data, imgsize_data, boxes_data, scores_data, conf_thresh, - anchors_data, n, h, w, an_num, class_num, box_num, input_size_h, - input_size_w, clip_bbox, scale, bias, iou_aware, iou_aware_factor); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(yolo_box, ops::YoloBoxOpCUDAKernel, - ops::YoloBoxOpCUDAKernel); diff --git a/paddle/fluid/operators/detection/yolo_box_op.h b/paddle/fluid/operators/detection/yolo_box_op.h deleted file mode 100644 index 2cd69c60b7c44d0557c23b8d1bd933650e8402c3..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/detection/yolo_box_op.h +++ /dev/null @@ -1,180 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
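Note (reference sketch, not part of the patch): the CUDA kernel deleted above and the CPU header deleted below implement the same box decode, which now lives in the phi yolo_box kernel. The math, restated standalone (names mirror the deleted GetYoloBox helper):

#include <cmath>

inline float Sigmoid(float x) { return 1.0f / (1.0f + std::exp(-x)); }

// Decode one raw prediction into an (cx, cy, w, h) box, as in the deleted
// GetYoloBox: stride is the grid cell count h*w, index points at the cx
// channel of this anchor/cell, scale and bias are the scale_x_y adjustment.
void DecodeYoloBox(float box[4], const float* x, const int* anchors, int i,
                   int j, int an_idx, int grid_h, int grid_w, int input_h,
                   int input_w, int index, int stride, int img_h, int img_w,
                   float scale, float bias) {
  box[0] = (i + Sigmoid(x[index]) * scale + bias) * img_w / grid_w;
  box[1] = (j + Sigmoid(x[index + stride]) * scale + bias) * img_h / grid_h;
  box[2] = std::exp(x[index + 2 * stride]) * anchors[2 * an_idx] * img_w /
           input_w;
  box[3] = std::exp(x[index + 3 * stride]) * anchors[2 * an_idx + 1] * img_h /
           input_h;
}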
*/ - -#pragma once -#include -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/core/hostdevice.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -HOSTDEVICE inline T sigmoid(T x) { - return 1.0 / (1.0 + std::exp(-x)); -} - -template -HOSTDEVICE inline void GetYoloBox(T* box, const T* x, const int* anchors, int i, - int j, int an_idx, int grid_size_h, - int grid_size_w, int input_size_h, - int input_size_w, int index, int stride, - int img_height, int img_width, float scale, - float bias) { - box[0] = (i + sigmoid(x[index]) * scale + bias) * img_width / grid_size_w; - box[1] = (j + sigmoid(x[index + stride]) * scale + bias) * img_height / - grid_size_h; - box[2] = std::exp(x[index + 2 * stride]) * anchors[2 * an_idx] * img_width / - input_size_w; - box[3] = std::exp(x[index + 3 * stride]) * anchors[2 * an_idx + 1] * - img_height / input_size_h; -} - -HOSTDEVICE inline int GetEntryIndex(int batch, int an_idx, int hw_idx, - int an_num, int an_stride, int stride, - int entry, bool iou_aware) { - if (iou_aware) { - return (batch * an_num + an_idx) * an_stride + - (batch * an_num + an_num + entry) * stride + hw_idx; - } else { - return (batch * an_num + an_idx) * an_stride + entry * stride + hw_idx; - } -} - -HOSTDEVICE inline int GetIoUIndex(int batch, int an_idx, int hw_idx, int an_num, - int an_stride, int stride) { - return batch * an_num * an_stride + (batch * an_num + an_idx) * stride + - hw_idx; -} - -template -HOSTDEVICE inline void CalcDetectionBox(T* boxes, T* box, const int box_idx, - const int img_height, - const int img_width, bool clip_bbox) { - boxes[box_idx] = box[0] - box[2] / 2; - boxes[box_idx + 1] = box[1] - box[3] / 2; - boxes[box_idx + 2] = box[0] + box[2] / 2; - boxes[box_idx + 3] = box[1] + box[3] / 2; - - if (clip_bbox) { - boxes[box_idx] = boxes[box_idx] > 0 ? boxes[box_idx] : static_cast(0); - boxes[box_idx + 1] = - boxes[box_idx + 1] > 0 ? boxes[box_idx + 1] : static_cast(0); - boxes[box_idx + 2] = boxes[box_idx + 2] < img_width - 1 - ? boxes[box_idx + 2] - : static_cast(img_width - 1); - boxes[box_idx + 3] = boxes[box_idx + 3] < img_height - 1 - ? 
boxes[box_idx + 3] - : static_cast(img_height - 1); - } -} - -template -HOSTDEVICE inline void CalcLabelScore(T* scores, const T* input, - const int label_idx, const int score_idx, - const int class_num, const T conf, - const int stride) { - for (int i = 0; i < class_num; i++) { - scores[score_idx + i] = conf * sigmoid(input[label_idx + i * stride]); - } -} - -template -class YoloBoxKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); - auto* imgsize = ctx.Input("ImgSize"); - auto* boxes = ctx.Output("Boxes"); - auto* scores = ctx.Output("Scores"); - auto anchors = ctx.Attr>("anchors"); - int class_num = ctx.Attr("class_num"); - float conf_thresh = ctx.Attr("conf_thresh"); - int downsample_ratio = ctx.Attr("downsample_ratio"); - bool clip_bbox = ctx.Attr("clip_bbox"); - bool iou_aware = ctx.Attr("iou_aware"); - float iou_aware_factor = ctx.Attr("iou_aware_factor"); - float scale = ctx.Attr("scale_x_y"); - float bias = -0.5 * (scale - 1.); - - const int n = input->dims()[0]; - const int h = input->dims()[2]; - const int w = input->dims()[3]; - const int box_num = boxes->dims()[1]; - const int an_num = anchors.size() / 2; - int input_size_h = downsample_ratio * h; - int input_size_w = downsample_ratio * w; - - const int stride = h * w; - const int an_stride = (class_num + 5) * stride; - - Tensor anchors_; - auto anchors_data = - anchors_.mutable_data({an_num * 2}, ctx.GetPlace()); - std::copy(anchors.begin(), anchors.end(), anchors_data); - - const T* input_data = input->data(); - const int* imgsize_data = imgsize->data(); - T* boxes_data = boxes->mutable_data({n, box_num, 4}, ctx.GetPlace()); - memset(boxes_data, 0, boxes->numel() * sizeof(T)); - T* scores_data = - scores->mutable_data({n, box_num, class_num}, ctx.GetPlace()); - memset(scores_data, 0, scores->numel() * sizeof(T)); - - T box[4]; - for (int i = 0; i < n; i++) { - int img_height = imgsize_data[2 * i]; - int img_width = imgsize_data[2 * i + 1]; - - for (int j = 0; j < an_num; j++) { - for (int k = 0; k < h; k++) { - for (int l = 0; l < w; l++) { - int obj_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, - stride, 4, iou_aware); - T conf = sigmoid(input_data[obj_idx]); - if (iou_aware) { - int iou_idx = - GetIoUIndex(i, j, k * w + l, an_num, an_stride, stride); - T iou = sigmoid(input_data[iou_idx]); - conf = pow(conf, static_cast(1. 
- iou_aware_factor)) * - pow(iou, static_cast(iou_aware_factor)); - } - if (conf < conf_thresh) { - continue; - } - - int box_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, - stride, 0, iou_aware); - GetYoloBox(box, input_data, anchors_data, l, k, j, h, w, - input_size_h, input_size_w, box_idx, stride, - img_height, img_width, scale, bias); - box_idx = (i * box_num + j * stride + k * w + l) * 4; - CalcDetectionBox(boxes_data, box, box_idx, img_height, img_width, - clip_bbox); - - int label_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, - stride, 5, iou_aware); - int score_idx = (i * box_num + j * stride + k * w + l) * class_num; - CalcLabelScore(scores_data, input_data, label_idx, score_idx, - class_num, conf, stride); - } - } - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/determinant_op.h b/paddle/fluid/operators/determinant_op.h index 375ef4344f4741c947ef3134696d64cdae696780..f89ecd37222870f73d00870c9454bf5590d504e3 100644 --- a/paddle/fluid/operators/determinant_op.h +++ b/paddle/fluid/operators/determinant_op.h @@ -19,11 +19,17 @@ #include #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/matrix_inverse.h" -#include "paddle/fluid/operators/svd_helper.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/for_range.h" -#include "paddle/phi/kernels/funcs/complex_functors.h" +#include "paddle/phi/kernels/complex_kernel.h" +#include "paddle/phi/kernels/full_kernel.h" +#include "paddle/phi/kernels/funcs/diag_functor.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/matrix_inverse.h" +#include "paddle/phi/kernels/funcs/unsqueeze.h" +#include "paddle/phi/kernels/math_kernel.h" +#include "paddle/phi/kernels/matmul_kernel.h" +#include "paddle/phi/kernels/transpose_kernel.h" namespace paddle { namespace operators { @@ -172,7 +178,7 @@ template class DeterminantGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto& dev_ctx = context.template device_context(); + auto& orig_dev_ctx = context.template device_context(); const auto* input = context.Input("Input"); const auto* det = context.Input("Out"); const auto* grad = @@ -200,15 +206,18 @@ class DeterminantGradKernel : public framework::OpKernel { // checked in forward, pass } + auto& dev_ctx = static_cast< + const typename framework::ConvertToPhiContext::TYPE&>( + orig_dev_ctx); + // Check Whether the matrix is invertible // (matrix A not invertible) == (det(A)=0) if (!CheckMatrixInvertible(context, det)) { // The matrix is not invertible VLOG(3) << "The input matrix not invertible!"; ddet->Resize(input->dims()); - ddet->mutable_data(context.GetPlace()); - phi::funcs::SetConstant zero; - zero(dev_ctx, ddet, static_cast(0.0f)); + phi::Full(dev_ctx, phi::vectorize(input->dims()), static_cast(0.0f), + ddet); return; } @@ -218,35 +227,35 @@ class DeterminantGradKernel : public framework::OpKernel { // we set d|A| = unsqueeze(dA * |A|, [-1, -2]) * inverse(A).transpose(-2, // -1) - math::DeviceIndependenceTensorOperations helper(context); - // First: inverse(A) framework::Tensor inverse_A; // A must be square matrices! 
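Note (standalone check, not part of the patch): the rewritten DeterminantGradKernel above relies on the identity d|A|/dA = |A| * inverse(A)^T, which is exactly the chain unsqueeze(dOut * |A|) * inverse(A).transpose(-2, -1) built from phi ops. A tiny 2x2 verification, independent of the Paddle kernels:

#include <cstdio>

int main() {
  double a = 3, b = 1, c = 2, d = 4;        // A = [[a, b], [c, d]]
  double det = a * d - b * c;               // |A| = 10
  double inv[2][2] = {{d / det, -b / det},  // inverse(A)
                      {-c / det, a / det}};
  // gradient of det w.r.t. each entry: |A| * inverse(A)^T
  double grad[2][2] = {{det * inv[0][0], det * inv[1][0]},
                       {det * inv[0][1], det * inv[1][1]}};
  std::printf("d|A|/dA = [[%g, %g], [%g, %g]]\n",
              grad[0][0], grad[0][1], grad[1][0], grad[1][1]);
  // expected [[d, -c], [-b, a]] = [[4, -2], [-1, 3]], matching the
  // elementwise partial derivatives of ad - bc.
  return 0;
}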
inverse_A.Resize(input->dims()); inverse_A.mutable_data(context.GetPlace()); - math::MatrixInverseFunctor mat_inv; - mat_inv(dev_ctx, *input, &inverse_A); + phi::funcs::MatrixInverseFunctor mat_inv; + mat_inv(orig_dev_ctx, *input, &inverse_A); VLOG(3) << "inverse(A) dims: " << inverse_A.dims(); // Second: inverse(A).transpose(-2, -1) - framework::Tensor transpose_inverse_A = helper.Transpose(inverse_A); + framework::Tensor transpose_inverse_A = + phi::TransposeLast2Dim(dev_ctx, inverse_A); + VLOG(3) << "(dA * |A|).transpose(-2, -1) dims: " << transpose_inverse_A.dims(); // Third: dA * |A| - auto mul_dA_detA = helper.Mul(*grad, *det); + auto mul_dA_detA = phi::Multiply(dev_ctx, *grad, *det); VLOG(3) << "dA * |A| dims: " << mul_dA_detA.dims(); // Fourth: unsqueeze(dA * |A|, [-1, -2]) - auto unsqueeze1 = helper.Unsqueeze(mul_dA_detA, -1); - auto unsqueeze2 = helper.Unsqueeze(unsqueeze1, -2); + auto unsqueeze1 = phi::funcs::Unsqueeze(mul_dA_detA, -1); + auto unsqueeze2 = phi::funcs::Unsqueeze(unsqueeze1, -2); VLOG(3) << "unsqueezed(dA * |A|) dims: " << unsqueeze2.dims(); // Finally: unsqueeze(dA * |A|) * inverse(A) - auto res = helper.Mul(unsqueeze2, transpose_inverse_A); + auto res = phi::Multiply(dev_ctx, unsqueeze2, transpose_inverse_A); VLOG(3) << "unsqueeze(dA * |A|) * inverse(A) dims: " << res.dims(); @@ -331,7 +340,7 @@ template class SlogDeterminantGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto& dev_ctx = context.template device_context(); + auto& orig_dev_ctx = context.template device_context(); const auto* input = context.Input("Input"); const auto* slogdet = context.Input("Out"); const auto* grad = @@ -353,6 +362,10 @@ class SlogDeterminantGradKernel : public framework::OpKernel { input->dims().size() - grad->dims().size())); } + auto& dev_ctx = static_cast< + const typename framework::ConvertToPhiContext::TYPE&>( + orig_dev_ctx); + // Check Whether the matrix is invertible // (matrix A not invertible) == (absslogdet(A)=0) auto slogdet_vec = slogdet->Split(1, 0); @@ -361,9 +374,8 @@ class SlogDeterminantGradKernel : public framework::OpKernel { // The matrix is not invertible VLOG(3) << "The input matrix not invertible!"; dslogdet->Resize(input->dims()); - dslogdet->mutable_data(context.GetPlace()); - phi::funcs::SetConstant zero; - zero(dev_ctx, dslogdet, std::numeric_limits::quiet_NaN()); + phi::Full(dev_ctx, phi::vectorize(input->dims()), + std::numeric_limits::quiet_NaN(), dslogdet); return; } @@ -373,34 +385,25 @@ class SlogDeterminantGradKernel : public framework::OpKernel { // we set dsl|A| = unsqueeze(dslA, [-1, -2]) * // inverse(A).conj().transpose(-2, -1) - math::DeviceIndependenceTensorOperations helper(context); - // First: inverse(A) framework::Tensor inverse_A; // A must be square matrices! 
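Note (reference only): the slogdet backward above follows the companion convention d(log|det A|)/dA = (A^{-1})^H, with the conjugate a no-op for real inputs. That is why the kernel only needs inverse(A).conj().transpose(-2, -1) scaled by the incoming gradient, and why a singular input is filled with NaN here (log|det A| is undefined at det A = 0) instead of the zero fill used in the plain determinant backward.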
inverse_A.Resize(input->dims()); inverse_A.mutable_data(context.GetPlace()); - math::MatrixInverseFunctor mat_inv; - mat_inv(dev_ctx, *input, &inverse_A); + phi::funcs::MatrixInverseFunctor mat_inv; + mat_inv(orig_dev_ctx, *input, &inverse_A); VLOG(3) << "inverse(A) dims: " << inverse_A.dims(); // Second: inverse(A).conj() - framework::Tensor conj_inverse_A; - conj_inverse_A.Resize(inverse_A.dims()); - auto numel = input->numel(); - auto* conj_data = conj_inverse_A.mutable_data(context.GetPlace(), - size_t(numel * sizeof(T))); - - platform::ForRange for_range(dev_ctx, numel); - phi::funcs::ConjFunctor functor(inverse_A.data(), numel, conj_data); - for_range(functor); + auto conj_inverse_A = phi::Conj(dev_ctx, inverse_A); VLOG(3) << "inverse(A).conj() dims: " << conj_inverse_A.dims(); // Third: inverse(A).conj().transpose(-2, -1) - framework::Tensor transpose_inverse_A = helper.Transpose(conj_inverse_A); + framework::Tensor transpose_inverse_A = + phi::TransposeLast2Dim(dev_ctx, conj_inverse_A); VLOG(3) << "inverse(A).conj().transpose(-2, -1) dims: " << transpose_inverse_A.dims(); @@ -417,12 +420,12 @@ class SlogDeterminantGradKernel : public framework::OpKernel { det_grad.Resize(det_grad.dims().reshape(det_grad_vec)); // Fifth: unsqueeze(dslA, [-1, -2]) - auto unsqueeze1 = helper.Unsqueeze(det_grad, -1); - auto unsqueeze2 = helper.Unsqueeze(unsqueeze1, -2); + auto unsqueeze1 = phi::funcs::Unsqueeze(det_grad, -1); + auto unsqueeze2 = phi::funcs::Unsqueeze(unsqueeze1, -2); VLOG(3) << "unsqueezed(dslA, [-1, -2]) dims: " << unsqueeze2.dims(); // Finally: unsqueeze(dslA) * inverse(A) - auto res = helper.Mul(unsqueeze2, transpose_inverse_A); + auto res = phi::Multiply(dev_ctx, unsqueeze2, transpose_inverse_A); VLOG(3) << "unsqueeze(dslA) * inverse(A) dims: " << res.dims(); framework::TensorCopy(res, context.GetPlace(), dslogdet); diff --git a/paddle/fluid/operators/diag_v2_op.cc b/paddle/fluid/operators/diag_v2_op.cc index 0160277dc79af50c555b1257e6ffa216b7b56b62..93fbff67e220bcf7d1f8dab112a07cc42649595f 100644 --- a/paddle/fluid/operators/diag_v2_op.cc +++ b/paddle/fluid/operators/diag_v2_op.cc @@ -62,8 +62,8 @@ class DiagV2OpMaker : public framework::OpProtoAndCheckerMaker { } // namespace paddle namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(diag_v2, DiagInferShapeFunctor, - PT_INFER_META(phi::DiagInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(diag_v2, DiagInferShapeFunctor, + PD_INFER_META(phi::DiagInferMeta)); REGISTER_OPERATOR( diag_v2, ops::DiagV2Op, ops::DiagV2OpMaker, diff --git a/paddle/fluid/operators/diagonal_op.cc b/paddle/fluid/operators/diagonal_op.cc index b419f629a1e635c5a463b732af3003e93a5674d6..bf3cc941539eaeb2e03f53eb2465532469be5697 100644 --- a/paddle/fluid/operators/diagonal_op.cc +++ b/paddle/fluid/operators/diagonal_op.cc @@ -12,7 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
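Note (schematic, not part of the patch): diag_v2 above (which also corrects the DELCARE_/PT_ spellings) and the diagonal, dist and dot operators below all apply the same mechanical migration: drop the hand-written InferShape and register a functor that forwards to the phi infermeta. The shape of the change, with a hypothetical op name (the macros are the real ones used in this patch; the names around them are placeholders):

DECLARE_INFER_SHAPE_FUNCTOR(my_op, MyOpInferShapeFunctor,
                            PD_INFER_META(phi::MyOpInferMeta));
REGISTER_OPERATOR(my_op, ops::MyOp, ops::MyOpMaker,
                  ops::MyOpGradMaker<paddle::framework::OpDesc>,
                  ops::MyOpGradMaker<paddle::imperative::OpBase>,
                  MyOpInferShapeFunctor);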
+#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -20,74 +23,6 @@ namespace operators { class DiagonalOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "diagonal"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "diagonal"); - - int offset_ = ctx->Attrs().Get("offset"); - int axis1 = ctx->Attrs().Get("axis1"); - int axis2 = ctx->Attrs().Get("axis2"); - - auto x_dims = ctx->GetInputDim("Input"); - int axis1_ = axis1 < 0 ? x_dims.size() + axis1 : axis1; - int axis2_ = axis2 < 0 ? x_dims.size() + axis2 : axis2; - - PADDLE_ENFORCE_GE( - x_dims.size(), 2, - platform::errors::OutOfRange("Input's dim is out of range (expected at " - "least 2 dimensions, but got %ld).", - x_dims.size())); - PADDLE_ENFORCE_LT( - axis1_, x_dims.size(), - platform::errors::OutOfRange( - "Attr(axis1) is out of range (expected to be in range of [%ld, " - "%ld], but got %ld).", - -(x_dims.size()), (x_dims.size() - 1), axis1)); - PADDLE_ENFORCE_LT( - axis2_, x_dims.size(), - platform::errors::OutOfRange( - "Attr(axis2) is out of range (expected to be in range of [%ld, " - "%ld], but got %ld).", - -(x_dims.size()), (x_dims.size() - 1), axis2)); - PADDLE_ENFORCE_NE(axis1_, axis2_, - platform::errors::InvalidArgument( - "The dimensions should not be identical " - "%d vs %d.", - axis1, axis2)); - - auto out_dims = vectorize(x_dims); - // from out_dims get the dim size of axis1_. - auto axis1_size = out_dims[axis1_]; - auto axis2_size = out_dims[axis2_]; - // delete two dims by attr axis1 and axis2 from out_dims. - /* example: - out_dim = [2, 3, 4]; - axis1 = 0; - axis2 = 1; - according to the attr of axis1 and axis2, we get: - out_dim = [4]. 
- */ - out_dims.erase(out_dims.begin() + std::max(axis1_, axis2_)); - out_dims.erase(out_dims.begin() + std::min(axis1_, axis2_)); - - if (offset_ == 0) { - out_dims.push_back(std::min(axis1_size, axis2_size)); - } else if (offset_ > 0) { - if ((axis2_size - offset_) > 0) { - out_dims.push_back(std::min(axis1_size, axis2_size - offset_)); - } else { - out_dims.push_back(0); - } - } else { - if ((axis1_size + offset_) > 0) { - out_dims.push_back(std::min(axis1_size + offset_, axis2_size)); - } else { - out_dims.push_back(0); - } - } - ctx->SetOutputDim("Out", phi::make_ddim(out_dims)); - } }; class DiagonalOpMaker : public framework::OpProtoAndCheckerMaker { @@ -170,9 +105,13 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(DiagonalGradNoNeedBufferVarsInferer, namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(diagonal, DiagonalInferShapeFunctor, + PD_INFER_META(phi::DiagonalInferMeta)); + REGISTER_OPERATOR(diagonal, ops::DiagonalOp, ops::DiagonalOpMaker, ops::DiagonalGradOpMaker, - ops::DiagonalGradOpMaker); + ops::DiagonalGradOpMaker, + DiagonalInferShapeFunctor); REGISTER_OPERATOR(diagonal_grad, ops::DiagonalGradOp, ops::DiagonalGradNoNeedBufferVarsInferer) diff --git a/paddle/fluid/operators/dist_op.cc b/paddle/fluid/operators/dist_op.cc index 3a53f1365567f99c9446077f7939d87c156c9a08..55b2484941293c8db47ef847bea959ebe82ff3ae 100644 --- a/paddle/fluid/operators/dist_op.cc +++ b/paddle/fluid/operators/dist_op.cc @@ -12,10 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/dist_op.h" #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" + namespace paddle { namespace operators { @@ -121,13 +124,11 @@ class DistGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(dist, DistInferShapeFunctor, + PD_INFER_META(phi::DistInferMeta)); + REGISTER_OPERATOR(dist, ops::DistOp, ops::DistOpMaker, ops::DistGradOpMaker, - ops::DistGradOpMaker); + ops::DistGradOpMaker, + DistInferShapeFunctor); REGISTER_OPERATOR(dist_grad, ops::DistOpGrad); -REGISTER_OP_CPU_KERNEL( - dist, ops::DistKernel, - ops::DistKernel); -REGISTER_OP_CPU_KERNEL( - dist_grad, ops::DistGradKernel, - ops::DistGradKernel) diff --git a/paddle/fluid/operators/dist_op.cu b/paddle/fluid/operators/dist_op.cu deleted file mode 100644 index 90674969e283f1cba816ad46802cdbf971bcc555..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/dist_op.cu +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
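Note (standalone restatement, not part of the patch): the output-shape rule deleted from DiagonalOp::InferShape above is now owned by phi::DiagonalInferMeta. For reference, the length of the extracted diagonal as a function of the two axis sizes and the offset:

#include <algorithm>
#include <cstdint>

// Diagonal length rule removed above: offset 0 takes the main diagonal,
// positive offsets move along axis2, negative offsets along axis1, and a
// diagonal that falls entirely outside the matrix has length 0.
int64_t DiagonalLen(int64_t axis1_size, int64_t axis2_size, int offset) {
  if (offset == 0) return std::min(axis1_size, axis2_size);
  if (offset > 0)
    return axis2_size - offset > 0 ? std::min(axis1_size, axis2_size - offset)
                                   : 0;
  return axis1_size + offset > 0 ? std::min(axis1_size + offset, axis2_size)
                                 : 0;
}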
- -#include "paddle/fluid/operators/dist_op.h" - -namespace ops = paddle::operators; -#ifdef PADDLE_WITH_HIP -// Eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h:922 -// do not support double in HIPCC platform (Eigen3 to be fixed) -REGISTER_OP_CUDA_KERNEL( - dist, ops::DistKernel); -REGISTER_OP_CUDA_KERNEL( - dist_grad, ops::DistGradKernel); -#else -REGISTER_OP_CUDA_KERNEL( - dist, ops::DistKernel, - ops::DistKernel); -REGISTER_OP_CUDA_KERNEL( - dist_grad, ops::DistGradKernel, - ops::DistGradKernel); -#endif diff --git a/paddle/fluid/operators/dist_op.h b/paddle/fluid/operators/dist_op.h deleted file mode 100644 index dfd7e29a8d0102261746ab47d3e1e805a674d7b1..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/dist_op.h +++ /dev/null @@ -1,304 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -template -using EigenTensor = framework::EigenTensor; -using framework::Tensor; - -template -static void GetBraodcastDims(const framework::DDim& x_dims, - const framework::DDim& y_dims, - Eigen::DSizes* x_bcast_dims, - Eigen::DSizes* y_bcast_dims) { - int bcast_dims_remainder = 0; - for (int i = 0; i < x_dims.size(); ++i) { - if (x_dims[i] >= y_dims[i]) { - (*x_bcast_dims)[i] = 1; - (*y_bcast_dims)[i] = x_dims[i] / y_dims[i]; - bcast_dims_remainder += x_dims[i] % y_dims[i]; - } else { - (*y_bcast_dims)[i] = 1; - (*x_bcast_dims)[i] = y_dims[i] / x_dims[i]; - bcast_dims_remainder += y_dims[i] % x_dims[i]; - } - } - PADDLE_ENFORCE_EQ(bcast_dims_remainder, 0, - platform::errors::PreconditionNotMet( - "The input tensor of Op(dist) could not be broadcast, " - "X's shape is [%s], Y's shape is [%s].", - x_dims, y_dims)); -} - -static framework::DDim GetNewDims(const framework::DDim& in_dims, int rank) { - std::vector new_dims_vec(rank); - if (in_dims.size() < rank) { - for (int i = 0; i < rank - in_dims.size(); ++i) { - new_dims_vec[i] = 1; - } - for (int i = 0; i < in_dims.size(); ++i) { - new_dims_vec[i + rank - in_dims.size()] = in_dims[i]; - } - } else { - new_dims_vec = vectorize(in_dims); - } - return phi::make_ddim(new_dims_vec); -} - -template -static void DistFunction(const framework::ExecutionContext& context) { - auto* x = context.Input("X"); - auto* y = context.Input("Y"); - auto* out = context.Output("Out"); - auto p = context.Attr("p"); - out->mutable_data(context.GetPlace()); - - auto x_dims = context.Input("X")->dims(); - auto y_dims = context.Input("Y")->dims(); - - // new dims with same size as rank, e.g. 
(rank=3, (4, 3) => (1, 4, 3)) - framework::DDim x_new_dims = GetNewDims(x_dims, Rank); - framework::DDim y_new_dims = GetNewDims(y_dims, Rank); - - auto x_t = EigenTensor::From(*x, x_new_dims); - auto y_t = EigenTensor::From(*y, y_new_dims); - auto out_t = EigenTensor::From(*out); - auto& place = - *context.template device_context().eigen_device(); - - Eigen::DSizes x_bcast_dims; - Eigen::DSizes y_bcast_dims; - GetBraodcastDims(x_new_dims, y_new_dims, &x_bcast_dims, &y_bcast_dims); - // p=0 means number of non-zero elements of (x-y) - // p=inf means the maximum of |x-y| - // p=-inf means the minimum of |x-y| - // otherwise, Lp-norm = pow(sum(pow(|x-y|, p)), 1/p) - if (p == 0) { - out_t.device(place) = - (x_t.broadcast(x_bcast_dims) != y_t.broadcast(y_bcast_dims)) - .template cast() - .sum(); - } else if (p == INFINITY) { - out_t.device(place) = - (x_t.broadcast(x_bcast_dims) - y_t.broadcast(y_bcast_dims)) - .abs() - .maximum(); - } else if (p == -INFINITY) { - out_t.device(place) = - (x_t.broadcast(x_bcast_dims) - y_t.broadcast(y_bcast_dims)) - .abs() - .minimum(); - } else { - out_t.device(place) = - (x_t.broadcast(x_bcast_dims) - y_t.broadcast(y_bcast_dims)) - .abs() - .pow(p) - .sum() - .pow(1.0 / p); - } -} - -template -static void DistGradFunction(const framework::ExecutionContext& context) { - auto* x = context.Input("X"); - auto* y = context.Input("Y"); - auto* out = context.Input("Out"); - auto p = context.Attr("p"); - - auto x_grad = context.Output(framework::GradVarName("X")); - auto y_grad = context.Output(framework::GradVarName("Y")); - auto out_grad = context.Input(framework::GradVarName("Out")); - - auto x_dims = context.Input("X")->dims(); - auto y_dims = context.Input("Y")->dims(); - auto out_dims = context.Input("Out")->dims(); - - framework::DDim x_new_dims = GetNewDims(x_dims, Rank); - framework::DDim y_new_dims = GetNewDims(y_dims, Rank); - framework::DDim out_new_dims = GetNewDims(out_dims, Rank); - auto x_t = EigenTensor::From(*x, x_new_dims); - auto y_t = EigenTensor::From(*y, y_new_dims); - auto out_t = EigenTensor::From(*out, out_new_dims); - - Eigen::DSizes x_bcast_dims; - Eigen::DSizes y_bcast_dims; - Eigen::DSizes out_bcast_dims; - - GetBraodcastDims(x_new_dims, y_new_dims, &x_bcast_dims, &y_bcast_dims); - std::vector new_dims_vec(Rank); - for (int i = 0; i < Rank; ++i) { - new_dims_vec[i] = std::max(x_new_dims[i], y_new_dims[i]); - out_bcast_dims[i] = new_dims_vec[i]; - } - framework::DDim new_dims = phi::make_ddim(new_dims_vec); - - auto& place = - *context.template device_context().eigen_device(); - auto out_grad_t = EigenTensor::From(*out_grad, out_new_dims); - framework::Tensor grad; - grad.mutable_data(new_dims, context.GetPlace()); - auto grad_t = EigenTensor::From(grad); - - auto x_minux_y = x_t.broadcast(x_bcast_dims) - y_t.broadcast(y_bcast_dims); - auto x_minux_y_abs = x_minux_y.abs(); - auto sign = - (x_minux_y > static_cast(0)).template cast() * static_cast(1.0) + - (x_minux_y < static_cast(0)).template cast() * static_cast(-1.0); - T epsilon = static_cast(1.0e-10f); - - // 1: Lp-norm(z), z = x-y, compute dz - if (p == 0) { - phi::funcs::SetConstant set_zero; - auto& dev_ctx = context.template device_context(); - set_zero(dev_ctx, &grad, static_cast(0)); - } else if (p == INFINITY || p == -INFINITY) { - // p=inf or -inf, Lp-norm = |z_i|, the j-th element of dz tends to 0 if - // j!=i, or equals to sign(z_i) * dout if j=i. 
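Note (scalar sketch, not part of the patch): the deleted DistFunction above, and the phi dist kernel that replaces it, compute the following p-norm of x - y after broadcasting. Restated standalone over equal-length, already-broadcast inputs:

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

double PDist(const std::vector<double>& x, const std::vector<double>& y,
             double p) {
  if (p == 0.0) {                       // number of non-zero elements of x - y
    std::size_t cnt = 0;
    for (std::size_t i = 0; i < x.size(); ++i) cnt += (x[i] != y[i]);
    return static_cast<double>(cnt);
  }
  if (std::isinf(p)) {                  // max (p=+inf) or min (p=-inf) of |x-y|
    double ext = std::fabs(x[0] - y[0]);
    for (std::size_t i = 1; i < x.size(); ++i) {
      double d = std::fabs(x[i] - y[i]);
      ext = p > 0 ? std::max(ext, d) : std::min(ext, d);
    }
    return ext;
  }
  double sum = 0.0;                     // otherwise (sum |x-y|^p)^(1/p)
  for (std::size_t i = 0; i < x.size(); ++i)
    sum += std::pow(std::fabs(x[i] - y[i]), p);
  return std::pow(sum, 1.0 / p);
}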
- if (platform::is_cpu_place(context.GetPlace())) { - grad_t.device(place) = (x_minux_y_abs == out_t.broadcast(out_bcast_dims)) - .template cast() * - sign.eval() * out_grad_t.broadcast(out_bcast_dims); - } else { - grad_t.device(place) = (x_minux_y_abs == out_t.broadcast(out_bcast_dims)) - .template cast() * - sign * out_grad_t.broadcast(out_bcast_dims); - } - } else { - // dz = pow(abs(x-y)/out, p-1) * sign(x-y) * dout - if (platform::is_cpu_place(context.GetPlace())) { - grad_t.device(place) = - (x_minux_y_abs / (out_t + epsilon).broadcast(out_bcast_dims)) - .pow(p - 1) * - sign.eval() * out_grad_t.broadcast(out_bcast_dims); - } else { - grad_t.device(place) = - (x_minux_y_abs / (out_t + epsilon).broadcast(out_bcast_dims)) - .pow(p - 1) * - sign * out_grad_t.broadcast(out_bcast_dims); - } - } - - Eigen::DSizes x_reshape_dims; - Eigen::DSizes y_reshape_dims; - Eigen::DSizes reduce_dims; - for (int i = 0; i < x_new_dims.size(); ++i) { - x_reshape_dims[2 * i] = x_bcast_dims[i]; - x_reshape_dims[2 * i + 1] = x_new_dims[i]; - y_reshape_dims[2 * i] = y_bcast_dims[i]; - y_reshape_dims[2 * i + 1] = y_new_dims[i]; - reduce_dims[i] = 2 * i; - } - - // 2: if x or y is broadcasted in forward function, - // the grad need to be sum along the broadcasted dimensions - if (x_grad) { - x_grad->mutable_data(context.GetPlace()); - auto x_grad_t = EigenTensor::From(*x_grad, x_new_dims); - x_grad_t.device(place) = grad_t.reshape(x_reshape_dims) - .sum(reduce_dims) - .reshape(x_grad_t.dimensions()); - } - if (y_grad) { - y_grad->mutable_data(context.GetPlace()); - auto y_grad_t = EigenTensor::From(*y_grad, y_new_dims); - y_grad_t.device(place) = -grad_t.reshape(y_reshape_dims) - .sum(reduce_dims) - .reshape(y_grad_t.dimensions()); - } -} - -template -class DistKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto x_rank = context.Input("X")->dims().size(); - auto y_rank = context.Input("Y")->dims().size(); - auto rank = std::max(x_rank, y_rank); - PADDLE_ENFORCE_LE(rank, 6, - platform::errors::Unimplemented( - "Op(dist) only support tensors with no more than 6 " - "dimensions, but X's rank is %d, Y's rank is %d.", - x_rank, y_rank)); - switch (rank) { - case 1: - DistFunction(context); - break; - case 2: - DistFunction(context); - break; - case 3: - DistFunction(context); - break; - case 4: - DistFunction(context); - break; - case 5: - DistFunction(context); - break; - case 6: - DistFunction(context); - break; - } - } -}; - -template -class DistGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto x_rank = context.Input("X")->dims().size(); - auto y_rank = context.Input("Y")->dims().size(); - auto rank = std::max(x_rank, y_rank); - PADDLE_ENFORCE_LE(rank, 6, - platform::errors::Unimplemented( - "Op(dist) only support tensors with no more than 6 " - "dimensions, but X's rank is %d, Y's rank is %d.", - x_rank, y_rank)); - switch (rank) { - case 1: - DistGradFunction(context); - break; - case 2: - DistGradFunction(context); - break; - case 3: - DistGradFunction(context); - break; - case 4: - DistGradFunction(context); - break; - case 5: - DistGradFunction(context); - break; - case 6: - DistGradFunction(context); - break; - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/dot_op.cc b/paddle/fluid/operators/dot_op.cc index ed2b09796eeeb8ce18fdc47be58347d85e6e1a80..8efdd15781a6f2dab48c0680ba87c7b427dc60ec 100644 
--- a/paddle/fluid/operators/dot_op.cc +++ b/paddle/fluid/operators/dot_op.cc @@ -14,6 +14,10 @@ #include "paddle/fluid/operators/dot_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" + namespace paddle { namespace operators { @@ -21,51 +25,6 @@ class DotOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(true, ctx->HasInput("X"), - platform::errors::PreconditionNotMet( - "Input(X) of DotOp should not be null.")); - PADDLE_ENFORCE_EQ(true, ctx->HasInput("Y"), - platform::errors::PreconditionNotMet( - "Input(Y) of DotOp should not be null.")); - PADDLE_ENFORCE_EQ(true, ctx->HasOutput("Out"), - platform::errors::PreconditionNotMet( - "Output(Out) of DotOp should not be null.")); - - auto x_dims = ctx->GetInputDim("X"); - auto x_rank = static_cast(x_dims.size()); - PADDLE_ENFORCE_EQ(true, 1 == x_rank || 2 == x_rank, - platform::errors::PreconditionNotMet( - "ShapeError: The dimensions of input tensor X (%s) " - "should be 1 or 2", - x_dims.to_str())); - - auto y_dims = ctx->GetInputDim("Y"); - PADDLE_ENFORCE_EQ( - true, x_rank == (size_t)y_dims.size(), - platform::errors::PreconditionNotMet( - "ShapeError: The shape of input tensor Y: %s should match with " - "input tenosr X: %s", - y_dims.to_str(), x_dims.to_str())); - bool shape_match = true; - for (size_t i = 0; i < x_rank; ++i) { - if (x_dims[i] != y_dims[i]) { - shape_match = false; - break; - } - } - - PADDLE_ENFORCE_EQ(true, shape_match, - platform::errors::PreconditionNotMet( - "ShapeError: The shape of input tensor X: %s should " - "be exactly the same " - "with input tensor Y: %s", - x_dims.to_str(), y_dims.to_str())); - auto dims = vectorize(x_dims); - dims[dims.size() - 1] = 1; - ctx->SetOutputDim("Out", phi::make_ddim(dims)); - } - framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( @@ -142,9 +101,13 @@ class DotOpGradMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(dot, DotInferShapeFunctor, + PD_INFER_META(phi::DotInferMeta)); + REGISTER_OPERATOR(dot, ops::DotOp, ops::DotOpMaker, ops::DotOpGradMaker, - ops::DotOpGradMaker); + ops::DotOpGradMaker, + DotInferShapeFunctor); REGISTER_OPERATOR(dot_grad, ops::DotGradOp); diff --git a/paddle/fluid/operators/dropout_impl.cu.h b/paddle/fluid/operators/dropout_impl.cu.h index 2fa956a2e6515e8b6a8e1c463c8ab8d1476f8d90..144198367d538e178a745c22902bb77a65f45fe4 100644 --- a/paddle/fluid/operators/dropout_impl.cu.h +++ b/paddle/fluid/operators/dropout_impl.cu.h @@ -32,10 +32,9 @@ limitations under the License. 
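Note (standalone restatement, not part of the patch): the checks deleted from DotOp::InferShape below are now handled by phi::DotInferMeta. The rule they enforced, for reference:

#include <cstdint>
#include <stdexcept>
#include <vector>

// dot expects X and Y to share the same 1-D or 2-D shape; Out keeps X's
// shape with the last dimension reduced to 1 (the per-row dot product).
std::vector<int64_t> DotOutShape(const std::vector<int64_t>& x_dims,
                                 const std::vector<int64_t>& y_dims) {
  if (x_dims.size() != 1 && x_dims.size() != 2)
    throw std::invalid_argument("dot expects a 1-D or 2-D input");
  if (x_dims != y_dims)
    throw std::invalid_argument("dot expects X and Y to have the same shape");
  std::vector<int64_t> out = x_dims;
  out.back() = 1;
  return out;
}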
*/ #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/dropout_impl_util.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" #include "paddle/fluid/platform/aligned_vector.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/kernels/funcs/functors.h" namespace paddle { @@ -86,8 +85,8 @@ __global__ void VectorizedRandomGenerator(const size_t n, uint64_t seed, bool is_upscale_in_train, uint64_t increment) { using MT = typename details::MPTypeTrait::Type; - using LoadT = platform::AlignedVector; - using MaskLoadT = platform::AlignedVector; + using LoadT = phi::AlignedVector; + using MaskLoadT = phi::AlignedVector; #ifdef PADDLE_WITH_HIP int64_t idx = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; @@ -102,7 +101,7 @@ __global__ void VectorizedRandomGenerator(const size_t n, uint64_t seed, MT factor = static_cast(1.0f / (1.0f - dropout_prob)); for (int i = idx * VecSize; i < n; i += blockDim.x * gridDim.x * VecSize) { LoadT src_val; - platform::Load(&src[i], &src_val); + phi::Load(&src[i], &src_val); #ifdef PADDLE_WITH_HIP float4 rand = hiprand_uniform4(&state); @@ -126,8 +125,8 @@ __global__ void VectorizedRandomGenerator(const size_t n, uint64_t seed, } } - platform::Store(dst_val, &dst[i]); - platform::Store(mask_val, &mask[i]); + phi::Store(dst_val, &dst[i]); + phi::Store(mask_val, &mask[i]); } } @@ -153,16 +152,16 @@ __global__ void DropoutGradCUDAKernel( const typename details::MPTypeTrait::Type factor, const int64_t size, T* dx) { using MT = typename details::MPTypeTrait::Type; - using LoadT = platform::AlignedVector; - using MaskLoadT = platform::AlignedVector; + using LoadT = phi::AlignedVector; + using MaskLoadT = phi::AlignedVector; int64_t idx = blockDim.x * blockIdx.x + threadIdx.x; for (int i = idx * VecSize; i < size; i += blockDim.x * gridDim.x * VecSize) { LoadT dout_val; - platform::Load(&dout[i], &dout_val); + phi::Load(&dout[i], &dout_val); MaskLoadT mask_val; - platform::Load(&mask[i], &mask_val); + phi::Load(&mask[i], &mask_val); LoadT dx_val; @@ -172,27 +171,28 @@ __global__ void DropoutGradCUDAKernel( static_cast(mask_val[j]) * factor); } - platform::Store(dx_val, &dx[i]); + phi::Store(dx_val, &dx[i]); } } template -void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx, - bool is_test, +void DropoutFwGPUKernelDriver(const phi::GPUContext& dev_ctx, bool is_test, const std::string dropout_implementation, float dropout_prob, bool upscale_in_train, - bool is_fix_seed, int seed_val, const Tensor& x, - const Tensor* seed, Tensor* mask, Tensor* y) { + bool is_fix_seed, int seed_val, + const framework::Tensor& x, + const framework::Tensor* seed, + framework::Tensor* mask, framework::Tensor* y) { auto& place = *dev_ctx.eigen_device(); + int64_t x_numel = x.numel(); + auto stream = dev_ctx.stream(); + auto* x_data = x.data(); + auto* y_data = y->data(); if (!is_test) { - int64_t x_numel = x.numel(); - auto stream = dev_ctx.stream(); auto* mask_data = mask->data(); size_t size = phi::product(mask->dims()); - auto* x_data = x.data(); - auto* y_data = y->data(); if (dropout_prob == 1.0f) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( @@ -219,8 +219,9 @@ void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx, uint64_t increment; // VectorizedRandomGenerator use curand_uniform4, so we only support 
// vec_size is 4; - int vec_size = (platform::GetVectorizedSize(x_data) == 4) ? 4 : 1; - auto gpu_config = GetGpuLaunchConfig1D(dev_ctx, x_numel, vec_size); + int vec_size = (phi::GetVectorizedSize(x_data) == 4) ? 4 : 1; + auto gpu_config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, x_numel, vec_size); auto offset = ((x_numel - 1) / (gpu_config.GetThreadNum() * vec_size) + 1) * vec_size; @@ -254,22 +255,37 @@ void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx, } #endif } else { - auto X = EigenMatrix::Reshape(x, 1); - auto Y = EigenMatrix::Reshape(*y, 1); if (upscale_in_train) { - Y.device(place) = X; +// todo: can y share with data with x directly? +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS( + hipMemcpyAsync(y_data, x_data, sizeof(T) * x_numel, + hipMemcpyDeviceToDevice, stream)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + cudaMemcpyAsync(y_data, x_data, sizeof(T) * x_numel, + cudaMemcpyDeviceToDevice, stream)); +#endif } else { - Y.device(place) = X * static_cast(1.0f - dropout_prob); + using MT = typename details::MPTypeTrait::Type; + MT factor = static_cast(1.0f - dropout_prob); + std::vector ins = {&x}; + std::vector outs = {y}; + auto functor = phi::funcs::ScaleFunctor(factor); + paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, + &outs, functor); } } } template -void DropoutGradGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx, +void DropoutGradGPUKernelDriver(const phi::GPUContext& dev_ctx, const std::string dropout_implementation, - float dropout_prob, const Tensor& grad_y, - const Tensor& mask, int64_t size, - Tensor* grad_x, bool is_test = false) { + float dropout_prob, + const framework::Tensor& grad_y, + const framework::Tensor& mask, int64_t size, + framework::Tensor* grad_x, + bool is_test = false) { using MT = typename details::MPTypeTrait::Type; auto stream = dev_ctx.stream(); MT factor; diff --git a/paddle/fluid/operators/dropout_impl_util.h b/paddle/fluid/operators/dropout_impl_util.h index d7db7dddce3887ca25ea1df34048f15663b2e987..c62d45570ba291dc60120c393d21842cc6548c61 100644 --- a/paddle/fluid/operators/dropout_impl_util.h +++ b/paddle/fluid/operators/dropout_impl_util.h @@ -20,7 +20,7 @@ limitations under the License. */ namespace paddle { namespace operators { -inline void GetSeedDataAndIncrement(const platform::CUDADeviceContext& dev_ctx, +inline void GetSeedDataAndIncrement(const phi::GPUContext& dev_ctx, const framework::Tensor* seed, const bool is_fix_seed, const int seed_val, const int offset, uint64_t* seed_data, diff --git a/paddle/fluid/operators/dropout_op.cc b/paddle/fluid/operators/dropout_op.cc index 7613b04bccfdc2084decc0b383eec199f7e10991..6d52ce45c4c10099dbeb4d4fadbf91f8c390ef46 100644 --- a/paddle/fluid/operators/dropout_op.cc +++ b/paddle/fluid/operators/dropout_op.cc @@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
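Note (CPU reference sketch, not part of the patch): the forward driver rewritten above and the CPU/GPU kernels deleted below preserve the same dropout contract. Restated standalone, with std::mt19937 standing in for Paddle's generator:

#include <cstddef>
#include <cstdint>
#include <random>
#include <vector>

// In "upscale_in_train" mode kept values are scaled by 1/(1-p) during
// training and passed through untouched at inference; in the legacy mode
// kept values are untouched during training and scaled by (1-p) at inference.
void DropoutForward(const std::vector<float>& x, float p, bool is_test,
                    bool upscale_in_train, unsigned seed,
                    std::vector<float>* y, std::vector<uint8_t>* mask) {
  y->resize(x.size());
  mask->assign(x.size(), 1);
  if (is_test) {
    float scale = upscale_in_train ? 1.0f : (1.0f - p);
    for (std::size_t i = 0; i < x.size(); ++i) (*y)[i] = x[i] * scale;
    return;
  }
  std::mt19937 gen(seed);
  std::uniform_real_distribution<float> dist(0.0f, 1.0f);
  for (std::size_t i = 0; i < x.size(); ++i) {
    if (dist(gen) < p) {
      (*mask)[i] = 0;
      (*y)[i] = 0.0f;
    } else {
      (*y)[i] = upscale_in_train ? x[i] / (1.0f - p) : x[i];
    }
  }
}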
*/ -#include "paddle/fluid/operators/dropout_op.h" #include #include +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { @@ -177,14 +177,3 @@ REGISTER_OPERATOR(dropout, ops::DropoutOp, ops::DropoutOpMaker, ops::DropoutGradOpMaker, ops::DropoutGradOpMaker); REGISTER_OPERATOR(dropout_grad, ops::DropoutOpGrad); -REGISTER_OP_CPU_KERNEL( - dropout, ops::CPUDropoutKernel, - ops::CPUDropoutKernel, - ops::CPUDropoutKernel); -REGISTER_OP_CPU_KERNEL( - dropout_grad, - ops::DropoutGradKernel, - ops::DropoutGradKernel, - ops::DropoutGradKernel); diff --git a/paddle/fluid/operators/dropout_op.cu b/paddle/fluid/operators/dropout_op.cu deleted file mode 100644 index f6ddff1d0327d3c7961781f875da69f89df1edec..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/dropout_op.cu +++ /dev/null @@ -1,94 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/dropout_impl.cu.h" -#include "paddle/fluid/operators/dropout_op.h" -#include "paddle/fluid/platform/bfloat16.h" -#include "paddle/fluid/platform/float16.h" - -namespace paddle { -namespace operators { - -// It seems that Eigen::Tensor::setRandom in GPU will SEGFAULT. -// Use std::random and thrust::random(thrust is a std library in CUDA) to -// implement uniform random. -template -class GPUDropoutKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* seed = - context.HasInput("Seed") ? 
context.Input("Seed") : nullptr; - auto* y = context.Output("Out"); - y->mutable_data(context.GetPlace()); - float dropout_prob = context.Attr("dropout_prob"); - - auto& dropout_implementation = - context.Attr("dropout_implementation"); - bool upscale_in_train = (dropout_implementation == "upscale_in_train"); - - bool is_test = context.Attr("is_test"); - - auto& dev_ctx = context.cuda_device_context(); - auto* mask = context.Output("Mask"); - mask->mutable_data(context.GetPlace()); - - bool is_fix_seed = context.Attr("fix_seed"); - int seed_val = context.Attr("seed"); - DropoutFwGPUKernelDriver(dev_ctx, is_test, dropout_implementation, - dropout_prob, upscale_in_train, is_fix_seed, - seed_val, *x, seed, mask, y); - } -}; - -template -class GPUDropoutGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* grad_x = context.Output(framework::GradVarName("X")); - auto* grad_y = context.Input(framework::GradVarName("Out")); - auto* mask = context.Input("Mask"); - grad_x->mutable_data(context.GetPlace()); - auto size = grad_x->numel(); - auto& dropout_implementation = - context.Attr("dropout_implementation"); - float dropout_prob = context.Attr("dropout_prob"); - - bool is_test = context.Attr("is_test"); - - auto& dev_ctx = - context.template device_context(); - DropoutGradGPUKernelDriver(dev_ctx, dropout_implementation, dropout_prob, - *grad_y, *mask, size, grad_x, is_test); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL( - dropout, ops::GPUDropoutKernel, - ops::GPUDropoutKernel, - ops::GPUDropoutKernel, - ops::GPUDropoutKernel); -REGISTER_OP_CUDA_KERNEL( - dropout_grad, ops::GPUDropoutGradKernel, - ops::GPUDropoutGradKernel, - ops::GPUDropoutGradKernel, - ops::GPUDropoutGradKernel); diff --git a/paddle/fluid/operators/dropout_op.h b/paddle/fluid/operators/dropout_op.h deleted file mode 100644 index ea6ed0e61947470c22f18e47acce2fca4cb9c41f..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/dropout_op.h +++ /dev/null @@ -1,151 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#pragma once - -#include -#include -#include - -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/generator.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -template -using EigenMatrix = framework::EigenMatrix; - -template -using EigenVector = framework::EigenVector; - -template -class CPUDropoutKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* seed = - context.HasInput("Seed") ? 
context.Input("Seed") : nullptr; - auto* y = context.Output("Out"); - const auto* x_data = x->data(); - auto* y_data = y->mutable_data(context.GetPlace()); - float dropout_prob = context.Attr("dropout_prob"); - - auto& dropout_implementation = - context.Attr("dropout_implementation"); - bool upscale_in_train = (dropout_implementation == "upscale_in_train"); - if (!context.Attr("is_test")) { - auto* mask = context.Output("Mask"); - auto* mask_data = mask->mutable_data(context.GetPlace()); - size_t size = phi::product(mask->dims()); - - // Special case when dropout_prob is 1.0 - if (dropout_prob == 1.0f) { - std::memset(y_data, 0, size * sizeof(*y_data)); // NOLINT - std::memset(mask_data, 0, size * sizeof(*mask_data)); // NOLINT - return; - } - // std::minstd_rand engine; - // NOTE: fixed seed should only be used in unittest or for debug. - // Guarantee to use random seed in training. - int seed_data = 0; - if (seed) { - seed_data = *(seed->data()); - } else { - seed_data = - context.Attr("fix_seed") ? context.Attr("seed") : 0; - } - auto engine = framework::GetCPURandomEngine(seed_data); - - std::uniform_real_distribution dist(0, 1); - - for (size_t i = 0; i < size; ++i) { - if (dist(*engine) < dropout_prob) { - mask_data[i] = 0; - y_data[i] = 0; - } else { - mask_data[i] = 1; - if (upscale_in_train) { - y_data[i] = x_data[i] / static_cast(1.0f - dropout_prob); - } else { - y_data[i] = x_data[i]; - } - } - } - } else { - if (upscale_in_train) { - const auto* X_data = x->data(); - auto* Y_data = y->mutable_data(context.GetPlace()); -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (int i = 0; i < x->numel(); i++) { - Y_data[i] = X_data[i]; - } - } else { - auto X = EigenMatrix::Reshape(*x, 1); - auto Y = EigenMatrix::Reshape(*y, 1); - auto& place = - *context.template device_context().eigen_device(); - Y.device(place) = X * static_cast(1.0f - dropout_prob); - } - } - } -}; -template -class DropoutGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* grad_x = context.Output(framework::GradVarName("X")); - auto* grad_y = context.Input(framework::GradVarName("Out")); - auto* mask = context.Input("Mask"); - grad_x->mutable_data(context.GetPlace()); - - auto dX = EigenVector::Flatten(*grad_x); - auto dY = EigenVector::Flatten(*grad_y); - - auto& place = - *context.template device_context().eigen_device(); - auto& dropout_implementation = - context.Attr("dropout_implementation"); - if (context.Attr("is_test") == true) { - if (dropout_implementation == "upscale_in_train") { - dX.device(place) = static_cast(1) * dY; - } else { - float dropout_prob = context.Attr("dropout_prob"); - dX.device(place) = dY * static_cast(1.0f - dropout_prob); - } - } else { - auto M = EigenVector::Flatten(*mask); - if (dropout_implementation == "upscale_in_train") { - float dropout_prob = context.Attr("dropout_prob"); - if (dropout_prob == 1.0f) { - dX.device(place) = static_cast(0) * dY; - } else { - dX.device(place) = - dY * M.cast() / static_cast(1.0f - dropout_prob); - } - } else { - dX.device(place) = dY * M.cast(); - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/dropout_op_npu.cc b/paddle/fluid/operators/dropout_op_npu.cc index 6aae566760623c666f3ce82a890a119e3e173390..07b3b5381162575cbfc03dd8cc10d0c88a2d21e8 100644 --- a/paddle/fluid/operators/dropout_op_npu.cc +++ b/paddle/fluid/operators/dropout_op_npu.cc @@ -15,8 +15,8 @@ limitations under the License. 
*/ #include #include +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" #include "paddle/phi/core/ddim.h" diff --git a/paddle/fluid/operators/dropout_op_test.cc b/paddle/fluid/operators/dropout_op_test.cc index 206d9a6c5e9c9869216f0a6c137accc931aa2a77..bdf08646f1d8b94d6d8d141d8a9fa9864cdc937b 100644 --- a/paddle/fluid/operators/dropout_op_test.cc +++ b/paddle/fluid/operators/dropout_op_test.cc @@ -24,14 +24,13 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace f = paddle::framework; namespace p = paddle::platform; -USE_OP(dropout); +USE_OP_ITSELF(dropout); void Compare(f::Scope* scope, const p::DeviceContext& ctx) { // init diff --git a/paddle/fluid/operators/dropout_op_xpu.cc b/paddle/fluid/operators/dropout_op_xpu.cc index 07b7e2cc7c09b09d6640f49fce438d58d0cc9cf2..7d8660f238abc8446b2988aad24a64c565e01ef9 100644 --- a/paddle/fluid/operators/dropout_op_xpu.cc +++ b/paddle/fluid/operators/dropout_op_xpu.cc @@ -8,15 +8,17 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/dropout_op.h" + #include #include +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/device_wrapper.h" namespace paddle { namespace operators { #ifdef PADDLE_WITH_XPU +using Tensor = framework::Tensor; template class DropoutXPUKernel : public framework::OpKernel { using XPUTyp = typename XPUTypeTrait::Type; diff --git a/paddle/fluid/operators/eig_op.h b/paddle/fluid/operators/eig_op.h index 03b25c6705ac562c57cc905766dd8062ebcb741d..5e4c83e1a45ebdb96a0e764cfa2d3997442ae1ea 100644 --- a/paddle/fluid/operators/eig_op.h +++ b/paddle/fluid/operators/eig_op.h @@ -18,12 +18,19 @@ #include #include #include "paddle/fluid/operators/math/matrix_solve.h" -#include "paddle/fluid/operators/svd_helper.h" #include "paddle/fluid/operators/transpose_op.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/phi/kernels/complex_kernel.h" #include "paddle/phi/kernels/funcs/complex_functors.h" +#include "paddle/phi/kernels/funcs/diag_functor.h" #include "paddle/phi/kernels/funcs/lapack/lapack_function.h" #include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/slice.h" +#include "paddle/phi/kernels/funcs/unsqueeze.h" +#include "paddle/phi/kernels/math_kernel.h" +#include "paddle/phi/kernels/matmul_kernel.h" +#include "paddle/phi/kernels/transpose_kernel.h" + #define EPSILON 1e-6 namespace paddle { @@ -87,19 +94,19 @@ void LapackEig(Tensor* input, Tensor* values, Tensor* vectors, int info, int values_stride = values->dims()[values->dims().size() - 1]; Tensor rwork; - phi::funcs::Real* rwork_data = nullptr; + phi::dtype::Real* rwork_data = nullptr; rwork.Resize(phi::make_ddim({lda * 2})); - rwork_data = rwork.mutable_data>(context.GetPlace()); + rwork_data = rwork.mutable_data>(context.GetPlace()); // call lapackEig once to compute the size of work; T computed_work_size; - phi::funcs::lapackEig>( + phi::funcs::lapackEig>( 
jobvl, jobvr, order, input_data, lda, values_data, lvector_data, ldvl, rvector_data, ldvr, &computed_work_size, lwork, rwork_data, &info); lwork = std::max( - 1, static_cast(phi::funcs::Real(computed_work_size))); + 1, static_cast(phi::dtype::Real(computed_work_size))); Tensor work; work.Resize(phi::make_ddim({lwork})); T* work_data = work.mutable_data(context.GetPlace()); @@ -109,7 +116,7 @@ void LapackEig(Tensor* input, Tensor* values, Tensor* vectors, int info, T* current_values = &values_data[i * values_stride]; T* current_rvectors = &rvector_data[i * matrix_stride]; - phi::funcs::lapackEig>( + phi::funcs::lapackEig>( jobvl, jobvr, order, current_matrix, lda, current_values, lvector_data, ldvl, current_rvectors, ldvr, work_data, lwork, rwork_data, &info); PADDLE_ENFORCE_EQ( @@ -207,23 +214,28 @@ class EigKernel : public framework::OpKernel { origin_dim.push_back(last_item * 2); framework::DDim big_dim = phi::make_ddim(origin_dim); - real_values.mutable_data>(big_dim, + real_values.mutable_data>(big_dim, context.GetPlace()); - real_vectors.mutable_data>(x->dims(), + real_vectors.mutable_data>(x->dims(), context.GetPlace()); - ApplyEigKernel>( + ApplyEigKernel>( *x, &real_values, &real_vectors, context); - auto dito = math::DeviceIndependenceTensorOperations< - DeviceContext, phi::funcs::Real, Tout>(context); + + auto& orig_dev_ctx = context.template device_context(); + auto& dev_ctx = static_cast< + const typename framework::ConvertToPhiContext::TYPE&>( + orig_dev_ctx); // 1. extract real part & imag part from real_values - Tensor real_part = dito.Slice(real_values, {-1}, {0}, {order}); - Tensor imag_part = dito.Slice(real_values, {-1}, {order}, {order * 2}); + Tensor real_part = + phi::funcs::Slice(dev_ctx, real_values, {-1}, {0}, {order}); + Tensor imag_part = phi::funcs::Slice(dev_ctx, real_values, {-1}, + {order}, {order * 2}); // 2. construct complex values - auto* real_part_data = real_part.data>(); - auto* imag_part_data = imag_part.data>(); + auto* real_part_data = real_part.data>(); + auto* imag_part_data = imag_part.data>(); int out_values_numel = out_values->numel(); platform::ForRange for_range( context.template device_context(), out_values_numel); @@ -233,10 +245,11 @@ class EigKernel : public framework::OpKernel { for_range(functor); // 3. 
construct complex vectors - Tensor real_vector_trans = dito.Transpose(real_vectors); + Tensor real_vector_trans = + phi::TransposeLast2Dim(dev_ctx, real_vectors); Tensor out_vectors_trans; out_vectors_trans.mutable_data(x->dims(), context.GetPlace()); - ConstructComplexVectors, Tout>( + ConstructComplexVectors, Tout>( &out_vectors_trans, *out_values, real_vector_trans, context, batch_count, order); TransposeTwoAxis(out_vectors_trans, out_vectors, @@ -251,45 +264,48 @@ class EigKernel : public framework::OpKernel { } }; -template +template void ComputeBackwardForComplexInput( const Tensor& V, const Tensor& L, const Tensor& gL, const Tensor& gV, - Tout* x_grad_data, int batch_count, int order, + T* x_grad_data, int batch_count, int order, const framework::ExecutionContext& context) { - auto dito = - math::DeviceIndependenceTensorOperations( - context); - - Tensor trans_v = dito.Transpose(V); - Tensor Vh = dito.Conj(trans_v); - Tensor Lconj = dito.Conj(L); - Tensor Econj = dito.Sub(dito.Unsqueeze(Lconj, -2), dito.Unsqueeze(Lconj, -1)); - Tensor VhgV = dito.Matmul(Vh, gV); - Tensor diag_real = dito.Real(VhgV); - Tensor diag_res = dito.BatchDiag(diag_real, batch_count); - Tensor diag_unsqueezed = dito.Unsqueeze(diag_res, -2); + auto& orig_dev_ctx = context.template device_context(); + auto& dev_ctx = static_cast< + const typename framework::ConvertToPhiContext::TYPE&>( + orig_dev_ctx); + + Tensor trans_v = phi::TransposeLast2Dim(dev_ctx, V); + Tensor Vh = phi::Conj(dev_ctx, trans_v); + Tensor Lconj = phi::Conj(dev_ctx, L); + Tensor Econj = phi::Subtract(dev_ctx, phi::funcs::Unsqueeze(Lconj, -2), + phi::funcs::Unsqueeze(Lconj, -1)); + Tensor VhgV = phi::Matmul(dev_ctx, Vh, gV); + Tensor diag_real = phi::Real(dev_ctx, VhgV); + Tensor diag_res = phi::funcs::BatchDiag(dev_ctx, diag_real, batch_count); + Tensor diag_unsqueezed = phi::funcs::Unsqueeze(diag_res, -2); // turn diag_unsqueezed into complex auto numel = diag_unsqueezed.numel(); Tensor diag_unsqueezed_complex; - auto* data_diag_un = diag_unsqueezed.data>(); - auto* data_diag_un_com = diag_unsqueezed_complex.mutable_data( + auto* data_diag_un = diag_unsqueezed.data>(); + auto* data_diag_un_com = diag_unsqueezed_complex.mutable_data( diag_unsqueezed.dims(), context.GetPlace(), - static_cast(numel * sizeof(Tout))); - auto& dev_ctx = context.template device_context(); - platform::ForRange for_range(dev_ctx, numel); - phi::funcs::RealToComplexFunctor functor(data_diag_un, data_diag_un_com, - numel); + static_cast(numel * sizeof(T))); + + platform::ForRange for_range(orig_dev_ctx, numel); + phi::funcs::RealToComplexFunctor functor(data_diag_un, data_diag_un_com, + numel); for_range(functor); // real tensor multiply complex tensor in broadcast manner - Tensor res1 = dito.RealMulComplex(V, diag_unsqueezed_complex); - Tensor res2 = dito.Matmul(Vh, res1); - Tensor result = dito.Sub(VhgV, res2); + Tensor res1 = phi::Multiply(dev_ctx, V, diag_unsqueezed_complex); + Tensor res2 = phi::Matmul(dev_ctx, Vh, res1); + Tensor result = phi::Subtract(dev_ctx, VhgV, res2); - result.mutable_data(V.dims(), context.GetPlace()); - result = dito.Div(result, Econj); - result = dito.DiagFill(order, order, order, 0, gL, result); - Tensor rhs = dito.Matmul(result, Vh); + result.mutable_data(V.dims(), context.GetPlace()); + result = phi::Divide(dev_ctx, result, Econj); + result = + phi::funcs::DiagFill(dev_ctx, order, order, order, 0, gL, result); + Tensor rhs = phi::Matmul(dev_ctx, result, Vh); // solve linear system // solve(Vh, rhs, out, m, k) @@ -298,10 +314,10 @@ 
void ComputeBackwardForComplexInput( // x_grad: out int m = Vh.dims()[Vh.dims().size() - 1]; int k = rhs.dims()[rhs.dims().size() - 1]; - auto* matrix_data = Vh.data(); - auto* rhs_data = rhs.data(); - math::SolveLinearSystem(matrix_data, rhs_data, x_grad_data, m, k, - batch_count); + auto* matrix_data = Vh.data(); + auto* rhs_data = rhs.data(); + math::SolveLinearSystem(matrix_data, rhs_data, x_grad_data, m, k, + batch_count); } template diff --git a/paddle/fluid/operators/eigh_op.cc b/paddle/fluid/operators/eigh_op.cc index 553d0e679cc6ddebd68c3edbc2de70209364bb53..4e33c567eb6d12fc504bfd76bc83072836feda21 100644 --- a/paddle/fluid/operators/eigh_op.cc +++ b/paddle/fluid/operators/eigh_op.cc @@ -12,7 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/eigh_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -22,42 +25,9 @@ using framework::Tensor; class EighOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Eigh"); - OP_INOUT_CHECK(ctx->HasOutput("Eigenvalues"), "Output", "Eigenvalues", - "Eigh"); - OP_INOUT_CHECK(ctx->HasOutput("Eigenvectors"), "Output", "Eigenvectors", - "Eigh"); - - auto input_dim = ctx->GetInputDim("X"); - auto rank = input_dim.size(); - - PADDLE_ENFORCE_GE(rank, 2, - platform::errors::InvalidArgument( - "The Input(X) should have at least 2 dimensions." - "But received a %d dimension tensor.", - rank)); - PADDLE_ENFORCE_EQ( - input_dim[rank - 2], input_dim[rank - 1], - platform::errors::InvalidArgument( - "Eigh op is designed for square matrix, consequently" - "inner-most 2 dimensions of Input(X) should be symmetric." 
- "But received X's shape[-2] = %d and shape[-1] = %d.", - input_dim[rank - 2], input_dim[rank - 1])); - - std::vector values_dim; - - for (auto i = 0; i < rank - 1; i++) { - values_dim.emplace_back(input_dim[i]); - } - - ctx->SetOutputDim("Eigenvalues", phi::make_ddim(values_dim)); - ctx->SetOutputDim("Eigenvectors", input_dim); - } }; -class EignOpMaker : public framework::OpProtoAndCheckerMaker { +class EighOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { AddInput("X", @@ -140,24 +110,11 @@ class EighGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(eigh, EighInferShapeFunctor, + PD_INFER_META(phi::EighInferMeta)); -REGISTER_OPERATOR(eigh, ops::EighOp, ops::EignOpMaker, +REGISTER_OPERATOR(eigh, ops::EighOp, ops::EighOpMaker, ops::EighGradOpMaker, - ops::EighGradOpMaker); + ops::EighGradOpMaker, + EighInferShapeFunctor); REGISTER_OPERATOR(eigh_grad, ops::EighGradOp); - -REGISTER_OP_CPU_KERNEL( - eigh, ops::EighKernel, - ops::EighKernel, - ops::EighKernel>, - ops::EighKernel>); - -REGISTER_OP_CPU_KERNEL( - eigh_grad, ops::EighGradKernel, - ops::EighGradKernel, - ops::EighGradKernel>, - ops::EighGradKernel>); diff --git a/paddle/fluid/operators/eigh_op.cu b/paddle/fluid/operators/eigh_op.cu deleted file mode 100644 index 827c551637d4df24529508ff37e6a92f157658a0..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/eigh_op.cu +++ /dev/null @@ -1,32 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/eigh_op.h" - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - eigh, ops::EighKernel, - ops::EighKernel, - ops::EighKernel>, - ops::EighKernel>); - -REGISTER_OP_CUDA_KERNEL( - eigh_grad, ops::EighGradKernel, - ops::EighGradKernel, - ops::EighGradKernel>, - ops::EighGradKernel>); diff --git a/paddle/fluid/operators/eigh_op.h b/paddle/fluid/operators/eigh_op.h deleted file mode 100644 index 294794877b32e5fe2522080a4d388d20564486b4..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/eigh_op.h +++ /dev/null @@ -1,74 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/eigen_values_vectors.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class EighKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto input = ctx.Input("X"); - auto output_w = ctx.Output("Eigenvalues"); - auto output_v = ctx.Output("Eigenvectors"); - std::string lower = ctx.Attr("UPLO"); - bool is_lower = (lower == "L"); - math::MatrixEighFunctor functor; - functor(ctx, *input, output_w, output_v, is_lower, true); - } -}; - -template -class EighGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - using ValueType = phi::funcs::Real; - auto& x_grad = *ctx.Output(framework::GradVarName("X")); - x_grad.mutable_data(ctx.GetPlace()); - auto& output_w = *ctx.Input("Eigenvalues"); - auto& output_v = *ctx.Input("Eigenvectors"); - auto& output_w_grad = - *ctx.Input(framework::GradVarName("Eigenvalues")); - auto& output_v_grad = - *ctx.Input(framework::GradVarName("Eigenvectors")); - - auto& dims = output_v.dims(); - const int m = dims[dims.size() - 1]; - auto dito = - math::DeviceIndependenceTensorOperations( - ctx); - auto tV = dito.Transpose(dito.Conj(output_v)); - auto W = dito.template Sub(dito.Unsqueeze(output_w, -2), - dito.Unsqueeze(output_w, -1)); - Tensor result = dito.Matmul(tV, output_v_grad); - result.mutable_data(dims, ctx.GetPlace()); - std::vector out_shape = phi::vectorize(dims); - auto constant = dito.Fill(out_shape, 0.5); - result = dito.Sub(result, dito.Conj(dito.Transpose(result))); - result = dito.Mul(result, constant); - result = dito.Div(result, W); - result = dito.DiagFill(m, m, m, 0, output_w_grad, result); - x_grad = dito.Matmul(output_v, dito.Matmul(result, tV)); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/eigvals_op.h b/paddle/fluid/operators/eigvals_op.h index 59eabfb29b97ee66ad470ff4e0ed65f6b5db76f4..4627acc0d07defcd0f6fc6dd82aaaac8c0f148ca 100644 --- a/paddle/fluid/operators/eigvals_op.h +++ b/paddle/fluid/operators/eigvals_op.h @@ -48,7 +48,7 @@ struct PaddleComplex< template using PaddleCType = typename PaddleComplex::type; template -using Real = typename phi::funcs::Real; +using Real = typename phi::dtype::Real; static void SpiltBatchSquareMatrix(const Tensor& input, std::vector* output) { @@ -144,7 +144,7 @@ LapackEigvals(const framework::ExecutionContext& ctx, const Tensor& input, required_work_mem, work_mem)); int64_t rwork_mem = rwork->memory_size(); - int64_t required_rwork_mem = (n_dim << 1) * sizeof(phi::funcs::Real); + int64_t required_rwork_mem = (n_dim << 1) * sizeof(phi::dtype::Real); PADDLE_ENFORCE_GE( rwork_mem, required_rwork_mem, platform::errors::InvalidArgument( @@ -154,11 +154,11 @@ LapackEigvals(const framework::ExecutionContext& ctx, const Tensor& input, required_rwork_mem, rwork_mem)); int info = 0; - phi::funcs::lapackEig>( + phi::funcs::lapackEig>( 'N', 'N', static_cast(n_dim), a.template data(), static_cast(n_dim), output->template data(), NULL, 1, NULL, 1, work->template data(), static_cast(work_mem / sizeof(T)), - rwork->template data>(), &info); + rwork->template data>(), &info); std::string name = "framework::platform::dynload::cgeev_"; if (framework::TransToProtoVarType(input.dtype()) == @@ -188,10 +188,10 @@ class EigvalsKernel : public framework::OpKernel { // query workspace size T qwork; 
int info; - phi::funcs::lapackEig>( + phi::funcs::lapackEig>( 'N', 'N', static_cast(n_dim), input_matrices[0].template data(), static_cast(n_dim), NULL, NULL, 1, NULL, 1, &qwork, -1, - static_cast*>(NULL), &info); + static_cast*>(NULL), &info); int64_t lwork = static_cast(qwork); Tensor work, rwork; @@ -208,7 +208,7 @@ class EigvalsKernel : public framework::OpKernel { } if (framework::IsComplexType( framework::TransToProtoVarType(input->dtype()))) { - rwork.mutable_data>(phi::make_ddim({n_dim << 1}), + rwork.mutable_data>(phi::make_ddim({n_dim << 1}), ctx.GetPlace()); } diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.kps b/paddle/fluid/operators/elementwise/elementwise_add_op.kps index d6e0749318e901947b46b4b1d6ff8bbdb16bef36..3b7457d72e15d733a45bc10ea433db1937dbac89 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.kps +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.kps @@ -39,7 +39,7 @@ limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_add_op.h" #else #include "paddle/fluid/operators/elementwise/elementwise_add_op.h" -#include "paddle/phi/kernels/gpu/elementwise.h" +#include "paddle/phi/kernels/gpu/elementwise_grad.h" #endif namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.cc b/paddle/fluid/operators/elementwise/elementwise_div_op.cc index 38cd232e4d1d2237cb5da014d11ba69a91cbe917..13fd9b81a8765aea140ad6ca2fc0383151a51dc7 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_div_op.cc @@ -102,42 +102,6 @@ REGISTER_OPERATOR( REGISTER_OPERATOR(elementwise_div_grad_grad, ops::ElementwiseDivOpDoubleGrad, ops::ElementwiseDoubleGradOpInplaceInferer); -REGISTER_OP_CPU_KERNEL( - elementwise_div, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel>, - ops::ElementwiseDivKernel>); -REGISTER_OP_CPU_KERNEL( - elementwise_div_grad, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel>, - ops::ElementwiseDivGradKernel>); - -REGISTER_OP_CPU_KERNEL( - elementwise_div_grad_grad, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel>, - ops::ElementwiseDivDoubleGradKernel>); - REGISTER_OP_VERSION(elementwise_div) .AddCheckpoint( R"ROC(Register elementwise_div for adding the attribute of Scale_y)ROC", diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.cu b/paddle/fluid/operators/elementwise/elementwise_div_op.cu deleted file mode 100644 index 9eb4b0352e5337e3fdd758d2e95cfa61d1d62724..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.cu +++ /dev/null @@ -1,96 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/elementwise/elementwise_div_op.h" - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -namespace paddle { -namespace operators { - -template -typename std::enable_if< - std::is_same::value>::type -ElementwiseDivGrad(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - const framework::Tensor* out, const framework::Tensor* dout, - framework::Tensor* dx, framework::Tensor* dy) { - int axis = ctx.Attr("axis"); - const auto& dev_ctx = ctx.template device_context(); - const auto place = ctx.GetPlace(); - if (dx != nullptr && dy != nullptr) { - std::vector ins = {dout, out, y}; - GetGradXAndYOut( - dev_ctx, place, axis, ins, dout, dx, dy, DivGradXYFunctor()); - } else if (dx != nullptr && dy == nullptr) { - std::vector ins = {dout, y}; - GetGradXOrYOut(dev_ctx, place, axis, ins, dout, - dx, DivGradXFunctor()); - } else if (dy != nullptr && dx == nullptr) { - std::vector ins = {dout, out, y}; - GetGradXOrYOut( - dev_ctx, place, axis, ins, dout, dy, DivGradYFunctor()); - } -} - -} // namespace operators -} // namespace paddle - -REGISTER_OP_CUDA_KERNEL( - elementwise_div, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel>, - ops::ElementwiseDivKernel>); -REGISTER_OP_CUDA_KERNEL( - elementwise_div_grad, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel>, - ops::ElementwiseDivGradKernel>); -REGISTER_OP_CUDA_KERNEL( - elementwise_div_grad_grad, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel>, - ops::ElementwiseDivDoubleGradKernel>); diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.h b/paddle/fluid/operators/elementwise/elementwise_div_op.h index c58a7f36548a57a1c8e7770fa282470fba4cc140..e9adb9abdb528c187817be641b81ffb6f64833b0 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_div_op.h @@ -20,142 +20,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -template -void default_elementwise_sub(const framework::ExecutionContext& ctx, - const framework::Tensor* x, - const framework::Tensor* y, framework::Tensor* z) { - int axis = ctx.Attr("axis"); - auto x_dims = x->dims(); - auto y_dims = y->dims(); - if (x_dims.size() >= y_dims.size()) { - ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, - SubFunctor(), z); - } else { - ElementwiseComputeEx, DeviceContext, T>( - ctx, x, y, axis, InverseSubFunctor(), z); - } -} - -template -void default_elementwise_div(const framework::ExecutionContext& ctx, - const framework::Tensor* x, - const framework::Tensor* y, framework::Tensor* z) { - int axis = ctx.Attr("axis"); - auto x_dims = x->dims(); - auto y_dims = y->dims(); - if (x_dims.size() >= y_dims.size()) { - ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, - DivFunctor(), z); - } else { - ElementwiseComputeEx, DeviceContext, T>( - ctx, x, y, axis, InverseDivFunctor(), z); - } -} - -template -class ElementwiseDivKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* z = ctx.Output("Out"); - z->mutable_data(ctx.GetPlace()); - - auto& dev_ctx = ctx.device_context(); - int axis = ctx.Attr("axis"); - auto pt_x = paddle::experimental::MakePhiDenseTensor(*x); - auto pt_y = paddle::experimental::MakePhiDenseTensor(*y); - auto pt_z = paddle::experimental::MakePhiDenseTensor(*z); - phi::DivideRawKernel( - static_cast::TYPE&>(dev_ctx), - *pt_x.get(), *pt_y.get(), axis, pt_z.get()); - } -}; - -template -struct DivGradDX { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout / y; } -}; - -template -struct DivGradDX> { - HOSTDEVICE paddle::platform::complex operator()( - paddle::platform::complex x, paddle::platform::complex y, - paddle::platform::complex out, - paddle::platform::complex dout) const { - paddle::platform::complex y_conj(y.real, -y.imag); - return dout / y_conj; - } -}; - -template -struct DivGradDY { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { - return -dout * out / y; - } -}; - -template -struct DivGradDY> { - HOSTDEVICE paddle::platform::complex operator()( - paddle::platform::complex x, paddle::platform::complex y, - paddle::platform::complex out, - paddle::platform::complex dout) const { - paddle::platform::complex out_div_y_conj((out / y).real, - -(out / y).imag); - return -dout * out_div_y_conj; - } -}; - -template -struct DivDoubleDY { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { - return y * out * dout - x * dout; - } -}; - -template -typename std::enable_if< - std::is_same::value>::type -ElementwiseDivGrad(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - const framework::Tensor* out, const framework::Tensor* dout, - framework::Tensor* dx, framework::Tensor* dy) { - int axis = ctx.Attr("axis"); - - ElemwiseGradCompute, DivGradDY>( - ctx, *x, *y, *out, *dout, axis, dx, dy, DivGradDX(), DivGradDY()); -} - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -template -typename std::enable_if< - std::is_same::value>::type -ElementwiseDivGrad(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - const framework::Tensor* out, const framework::Tensor* dout, - framework::Tensor* dx, framework::Tensor* dy); -#endif - -template -class ElementwiseDivGradKernel : public ElemwiseGradKernel { - public: - void 
Compute(const framework::ExecutionContext& ctx) const override { - ElemwiseGradKernel::Compute(ctx); - using Tensor = framework::Tensor; - - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Input("Out"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - - ElementwiseDivGrad(ctx, x, y, out, dout, dx, dy); - } -}; - class ElementwiseDivOpDoubleGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -206,80 +70,5 @@ class ElementwiseDivOpDoubleGrad : public framework::OperatorWithKernel { } }; -template -class ElementwiseDivDoubleGradKernel : public framework::OpKernel { - using Tensor = framework::Tensor; - - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* Y = ctx.Input("Y"); - auto* Out = ctx.Input("Out"); - auto* ddX = ctx.Input("DDX"); - auto* ddY = ctx.Input("DDY"); - auto* dX = ctx.Input("DX"); - - auto* dY = ctx.Output(framework::GradVarName("Y")); - auto* dOut = ctx.Output("DOut"); - auto* ddOut = ctx.Output("DDOut"); - - int axis = ctx.Attr("axis"); - - if (dY) dY->mutable_data(Y->dims(), ctx.GetPlace()); - if (dOut) dOut->mutable_data(Out->dims(), ctx.GetPlace()); - if (ddOut) ddOut->mutable_data(Out->dims(), ctx.GetPlace()); - - // ddX_safe == null ? 0 : ddX - // ddY_safe == null ? 0 : ddY - Tensor ddX_safe, ddY_safe; - GetDoubleGradSafeTensor(ctx, dX, ddX, &ddX_safe); - GetDoubleGradSafeTensor(ctx, Y, ddY, &ddY_safe); - - // ddOut = ddX / Y - Out * ddY / Y = (ddX - Out * ddY) / Y - // dY = Out * dX * ddY / Y - dX * ddX / Y - // dOut = - dX * ddY - // To save memory, (1) dout can be used as 'tmp' tensor, (2) ddout can - // inplace ddx - Tensor tmp; - if (dOut) { - tmp = *dOut; - } else { - auto& dev_ctx = ctx.template device_context(); - tmp = ctx.AllocateTmpTensor(Out->dims(), dev_ctx); - } - if (dY) { - // dX_div_Y = dX / Y; - Tensor dX_div_Y = tmp; - default_elementwise_div(ctx, dX, Y, &dX_div_Y); - - // NOTE(dengkaipeng): in the following ElemwiseGradCompute, for the - // first output tensor is nullptr, the branch to calculate first - // output tensor will not be activated, DivGradDx function will not - // be called and can be ignored, the first branch has little effect - // on running speed. 
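For reference, the divide double-grad identities quoted in the comments above come from treating the first-order backward of out = x / y as a function of (dout, y, out), with dx = dout / y and dy = -dout * out / y, and differentiating it against the incoming tangents (ddx, ddy):

\[
\mathrm{ddout}=\frac{\mathrm{ddx}-\mathrm{out}\cdot \mathrm{ddy}}{y},\qquad
\mathrm{dy}=\frac{\mathrm{out}\cdot \mathrm{dx}\cdot \mathrm{ddy}-\mathrm{dx}\cdot \mathrm{ddx}}{y},\qquad
\mathrm{dout}=-\,\mathrm{dx}\cdot \mathrm{ddy},
\]

where dx is the saved first-order gradient DX that the kernel receives as an input; the phi divide_double_grad kernel declared in the tests below is expected to reproduce exactly these expressions.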
- - // dY = Out * dX * ddY / Y - dX * ddX / Y - ElemwiseGradCompute, DivDoubleDY>( - ctx, ddX_safe, ddY_safe, *Out, dX_div_Y, axis, nullptr, dY, - DivGradDX(), DivDoubleDY()); - } - - if (ddOut) { - // ddOut = ddX / Y - Out * ddY / Y = (ddX - Out * ddY) / Y - default_elementwise_mul(ctx, Out, &ddY_safe, &tmp); - default_elementwise_sub(ctx, &ddX_safe, &tmp, &tmp); - default_elementwise_div(ctx, &tmp, Y, ddOut); - } - - if (dOut) { - // dOut = - dX * ddY - default_elementwise_mul(ctx, dX, &ddY_safe, dOut); - auto& place = - *ctx.template device_context().eigen_device(); - auto dout = framework::EigenVector::Flatten(*dOut); - dout.device(place) = static_cast(-1) * dout; - } - } -}; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_functor.h b/paddle/fluid/operators/elementwise/elementwise_functor.h index 86f5be3071c2d1a84f13da1cef74787003e633bb..14baeaa74d2421135401e94fbc10367d50b876fe 100644 --- a/paddle/fluid/operators/elementwise/elementwise_functor.h +++ b/paddle/fluid/operators/elementwise/elementwise_functor.h @@ -90,67 +90,6 @@ struct MinFunctor { template using Complex = paddle::platform::complex; -template -struct DivGradXYFunctor { - inline HOSTDEVICE phi::Array operator()(const InT a, const InT b, - const InT c) { - // dx = dout / y - // dy = - dout * out / y - phi::Array outs; - outs[0] = a / c; - outs[1] = -a * b / c; - return outs; - } -}; - -template -struct DivGradXYFunctor, Complex> { - inline HOSTDEVICE phi::Array, 2> operator()( - const Complex a, const Complex b, const Complex c) { - phi::Array, 2> outs; - Complex c_conj(c.real, -c.imag); - Complex out_div_c_conj((b / c).real, -(b / c).imag); - outs[0] = a / c_conj; - outs[1] = -a * out_div_c_conj; - return outs; - } -}; - -// Float div grad -template -struct DivGradXFunctor { - inline HOSTDEVICE T operator()(const T a, const T b) const { return a / b; } -}; - -// Complex div grad -template -struct DivGradXFunctor> { - inline HOSTDEVICE Complex operator()(const Complex a, - const Complex b) const { - Complex b_conj(b.real, -b.imag); - return a / b_conj; - } -}; - -// Float mul and div -template -struct DivGradYFunctor { - inline HOSTDEVICE T operator()(const T a, const T b, const T c) const { - return -a * b / c; - } -}; - -// Complex mul and div -template -struct DivGradYFunctor> { - inline HOSTDEVICE Complex operator()(const Complex a, - const Complex b, - const Complex c) const { - Complex out_div_c_conj((b / c).real, -(b / c).imag); - return -a * out_div_c_conj; - } -}; - // Fmax template struct FMaxFunctor { @@ -257,47 +196,6 @@ struct MinGradXYFunctor { } }; -template -struct MulGradFunctor { - inline HOSTDEVICE T operator()(const T a, const T b) const { return a * b; } -}; -template -struct MulGradFunctor> { - inline HOSTDEVICE Complex operator()(const Complex a, - const Complex b) const { - Complex b_conj(b.real, -b.imag); - return a * b_conj; - } -}; - -template -struct MulGradXYFunctor { - inline HOSTDEVICE phi::Array operator()(const InT a, const InT b, - const InT c) { - phi::Array outs; - // dx = dout * y - outs[0] = a * b; - // dy = dout * x - outs[1] = a * c; - return outs; - } -}; - -template -struct MulGradXYFunctor, Complex> { - inline HOSTDEVICE phi::Array, 2> operator()( - const Complex a, const Complex b, const Complex c) { - phi::Array, 2> outs; - // dx = dout * y - Complex b_conj(b.real, -b.imag); - outs[0] = a * b_conj; - // dy = dout * x - Complex c_conj(c.real, -c.imag); - outs[1] = a * c_conj; - return outs; - } -}; - // Ternary compare 
template struct MaxGradXFunctor { diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.cc b/paddle/fluid/operators/elementwise/elementwise_mul_op.cc index e172279145e28c0731ed0d8d91769d0b293662fe..830e09eeae4811eb44bd4e21e17fe83ee44c592d 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.cc @@ -173,55 +173,6 @@ REGISTER_OP_CPU_KERNEL( paddle::platform::complex>, ops::ElementwiseMulKernel>); -REGISTER_OP_CPU_KERNEL( - elementwise_mul_grad, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel>, - ops::ElementwiseMulGradKernel>); -REGISTER_OP_CPU_KERNEL( - elementwise_mul_grad_grad, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel>, - ops::ElementwiseMulDoubleGradKernel>); -REGISTER_OP_CPU_KERNEL( - elementwise_mul_triple_grad, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel>, - ops::ElementwiseMulTripleGradKernel>); REGISTER_OP_VERSION(elementwise_mul) .AddCheckpoint( diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu index 45c87a27a180af4798a9f8b31e2edfd0cacb583d..f7b9fd1e265f5d3f107e734f9ffdcc90e7f6cc77 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu @@ -63,33 +63,6 @@ class ElementwiseMulKernel } }; -template -typename std::enable_if< - std::is_same::value>::type -ElementwiseMulGrad(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - const framework::Tensor* out, const framework::Tensor* dout, - framework::Tensor* dx, framework::Tensor* dy) { - int axis = ctx.Attr("axis"); - const auto& dev_ctx = - ctx.template device_context(); - const auto place = ctx.GetPlace(); - - if (dx != nullptr && dy != nullptr) { - std::vector ins = {dout, y, x}; - GetGradXAndYOut( - dev_ctx, place, axis, ins, dout, dx, dy, MulGradXYFunctor()); - } else if (dx != nullptr && dy == nullptr) { - std::vector ins = {dout, y}; - GetGradXOrYOut(dev_ctx, place, axis, ins, dout, - dx, MulGradFunctor()); - } else if (dx == nullptr && dy != nullptr) { - std::vector ins = {dout, x}; - GetGradXOrYOut(dev_ctx, place, axis, ins, dout, - dy, MulGradFunctor()); - } -} - } // namespace operators } // namespace paddle @@ -103,44 +76,3 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseMulKernel, ops::ElementwiseMulKernel>, ops::ElementwiseMulKernel>); -REGISTER_OP_CUDA_KERNEL( - elementwise_mul_grad, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel>, - ops::ElementwiseMulGradKernel>); -REGISTER_OP_CUDA_KERNEL( - elementwise_mul_grad_grad, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - 
ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel>, - ops::ElementwiseMulDoubleGradKernel>); -REGISTER_OP_CUDA_KERNEL( - elementwise_mul_triple_grad, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel>, - ops::ElementwiseMulTripleGradKernel>); diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.h b/paddle/fluid/operators/elementwise/elementwise_mul_op.h index c81266d584468f51030026e1423a649252001f58..58a3123c7e332f50b0830577436528f1e8df1cdf 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.h @@ -137,244 +137,6 @@ class ElementwiseMulKernel : public framework::OpKernel { } } }; -template -struct MulGradDX { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout * y; } -}; - -template -struct MulGradDX> { - HOSTDEVICE paddle::platform::complex operator()( - paddle::platform::complex x, paddle::platform::complex y, - paddle::platform::complex out, - paddle::platform::complex dout) const { - paddle::platform::complex y_conj(y.real, -y.imag); - return dout * y_conj; - } -}; - -template -struct MulGradDY { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout * x; } -}; - -template -struct MulGradDY> { - HOSTDEVICE paddle::platform::complex operator()( - paddle::platform::complex x, paddle::platform::complex y, - paddle::platform::complex out, - paddle::platform::complex dout) const { - paddle::platform::complex x_conj(x.real, -x.imag); - return dout * x_conj; - } -}; -template -typename std::enable_if< - std::is_same::value>::type -ElementwiseMulGrad(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - const framework::Tensor* out, const framework::Tensor* dout, - framework::Tensor* dx, framework::Tensor* dy) { - int axis = ctx.Attr("axis"); - ElemwiseGradCompute, MulGradDY>( - ctx, *x, *y, *out, *dout, axis, dx, dy, MulGradDX(), MulGradDY()); -} - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -template -typename std::enable_if< - std::is_same::value>::type -ElementwiseMulGrad(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - const framework::Tensor* out, const framework::Tensor* dout, - framework::Tensor* dx, framework::Tensor* dy); -#endif - -template -class ElementwiseMulGradKernel : public ElemwiseGradKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - ElemwiseGradKernel::Compute(ctx); - using Tensor = framework::Tensor; - - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* out = dout; // out is not necessary - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - - ElementwiseMulGrad(ctx, x, y, out, dout, dx, dy); - } -}; - -template -class ElementwiseMulDoubleGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - using Tensor = framework::Tensor; - - auto* x = ctx.Input("X"); - auto* y = 
ctx.Input("Y"); - auto* dout = ctx.Input("DOut"); - auto* ddx = ctx.Input("DDX"); - auto* ddy = ctx.Input("DDY"); - - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - auto* ddout = ctx.Output("DDOut"); - - if (ddout) ddout->mutable_data(ctx.GetPlace()); - - Tensor ddx_safe, ddy_safe; - GetDoubleGradSafeTensor(ctx, x, ddx, &ddx_safe); - GetDoubleGradSafeTensor(ctx, y, ddy, &ddy_safe); - - // dx = dout * ddy - // dy = dout * ddx - // ddout = ddx * y + x * ddy - // change computation sequence to save memory, so ddout can inplace ddx and - // dx can be used as 'tmp' tensor - // (1) dx = x * ddy - // (2) dy = dout * ddx - // (3) ddout = ddx * y - // (4) ddout = ddout + dx - // (5) dx = dout * ddy - if (ddout) { - int axis = ctx.Attr("axis"); - auto& place = - *ctx.template device_context().eigen_device(); - // size(ddout) > size(ddx), ddout can't use memory of ddx using inplace - if (ddout->numel() > ddx->numel()) { - ElemwiseGradCompute, MulGradDY>( - ctx, ddx_safe, ddy_safe, *dout, *dout, axis, dx, dy, MulGradDX(), - MulGradDY()); - - Tensor ddout_tmp; - ddout_tmp.mutable_data(ddout->dims(), ctx.GetPlace()); - - default_elementwise_mul(ctx, y, &ddx_safe, ddout); - default_elementwise_mul(ctx, &ddy_safe, x, - &ddout_tmp); - - auto ddout_t = framework::EigenVector::Flatten(*ddout); - auto ddout_tmp_t = framework::EigenVector::Flatten(ddout_tmp); - ddout_t.device(place) = ddout_t + ddout_tmp_t; - } else { - // use dx to save memory, other than alloc tmp tensor - Tensor* ddout_tmp = dx; - - default_elementwise_mul(ctx, x, &ddy_safe, ddout_tmp); - // NOTE: in the following ElemwiseGradCompute, for the - // first output tensor is nullptr, the branch to calculate first - // output tensor will not be activated, DivGradDx function will not - // be called and can be ignored, the first branch has little effect - // on running speed. 
- ElemwiseGradCompute, MulGradDY>( - ctx, ddx_safe, ddy_safe, *dout, *dout, axis, nullptr, dy, - MulGradDX(), MulGradDY()); - default_elementwise_mul(ctx, &ddx_safe, y, ddout); - - auto ddout_t = framework::EigenVector::Flatten(*ddout); - auto ddout_tmp_t = framework::EigenVector::Flatten(*ddout_tmp); - ddout_t.device(place) = ddout_t + ddout_tmp_t; - default_elementwise_mul(ctx, dout, &ddy_safe, dx); - } - } - } -}; - -template -class ElementwiseMulTripleGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - using Tensor = framework::Tensor; - // get input - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input("DOut"); - auto* ddx = ctx.Input("DDX"); - auto* ddy = ctx.Input("DDY"); - - auto* d_dx = ctx.Input("D_DX"); - auto* d_dy = ctx.Input("D_DY"); - auto* d_ddout = ctx.Input("D_DDOut"); - - // get output - auto* out_d_x = ctx.Output("D_X"); - auto* out_d_y = ctx.Output("D_Y"); - auto* out_d_dout = ctx.Output("D_DOut"); - - auto* out_d_ddx = ctx.Output("D_DDX"); - auto* out_d_ddy = ctx.Output("D_DDY"); - - if (out_d_x) out_d_x->mutable_data(x->dims(), ctx.GetPlace()); - if (out_d_y) out_d_y->mutable_data(y->dims(), ctx.GetPlace()); - if (out_d_dout) out_d_dout->mutable_data(dout->dims(), ctx.GetPlace()); - if (out_d_ddx) out_d_ddx->mutable_data(x->dims(), ctx.GetPlace()); - if (out_d_ddy) out_d_ddy->mutable_data(y->dims(), ctx.GetPlace()); - - auto& place = *ctx.template device_context().eigen_device(); - - Tensor ddx_safe, ddy_safe; - GetDoubleGradSafeTensor(ctx, x, ddx, &ddx_safe); - GetDoubleGradSafeTensor(ctx, y, ddy, &ddy_safe); - - if (d_ddout) { - if (out_d_x) { - // out_d_x = ddy * d_ddout - default_elementwise_mul(ctx, &ddy_safe, d_ddout, - out_d_x); - } - if (out_d_y) { - // out_d_y = ddx * d_ddout - default_elementwise_mul(ctx, &ddx_safe, d_ddout, - out_d_y); - } - } - - if (out_d_dout) { - // get out_d_dout - // out_d_dout = ddy * d_dx + d_dy * ddx - Tensor out_d_dout_tmp; - out_d_dout_tmp.mutable_data(dout->dims(), ctx.GetPlace()); - default_elementwise_mul(ctx, d_dy, &ddx_safe, - out_d_dout); - default_elementwise_mul(ctx, &ddy_safe, d_dx, - &out_d_dout_tmp); - auto out_d_dout_t = framework::EigenVector::Flatten(*out_d_dout); - auto out_d_dout_tmp_t = - framework::EigenVector::Flatten(out_d_dout_tmp); - out_d_dout_t.device(place) = out_d_dout_t + out_d_dout_tmp_t; - } - - if (out_d_ddx) { - // get out_d_ddx - // out_d_ddx = dout * d_dy + y * d_ddout - Tensor out_d_ddx_tmp; - out_d_ddx_tmp.mutable_data(ddx->dims(), ctx.GetPlace()); - default_elementwise_mul(ctx, dout, d_dy, out_d_ddx); - default_elementwise_mul(ctx, y, d_ddout, - &out_d_ddx_tmp); - auto out_d_ddx_t = framework::EigenVector::Flatten(*out_d_ddx); - auto out_d_ddx_tmp_t = framework::EigenVector::Flatten(out_d_ddx_tmp); - out_d_ddx_t.device(place) = out_d_ddx_t + out_d_ddx_tmp_t; - } - - if (out_d_ddy) { - // get out_d_ddy - // out_d_ddy = dout * d_dx + x * d_ddout - Tensor out_d_ddy_tmp; - out_d_ddy_tmp.mutable_data(ddy->dims(), ctx.GetPlace()); - default_elementwise_mul(ctx, dout, d_dx, out_d_ddy); - default_elementwise_mul(ctx, x, d_ddout, - &out_d_ddy_tmp); - auto out_d_ddy_t = framework::EigenVector::Flatten(*out_d_ddy); - auto out_d_ddy_tmp_t = framework::EigenVector::Flatten(out_d_ddy_tmp); - out_d_ddy_t.device(place) = out_d_ddy_t + out_d_ddy_tmp_t; - } - } -}; } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h 
b/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h index 418779c32e8bc216be1532bf714bc21d91c452aa..102127e6ffe4ea60b8305c718e645a3695557ae4 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h @@ -16,9 +16,6 @@ #include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" -// only can include the headers in paddle/top/api dirs -#include "paddle/phi/kernels/gpu/elementwise.h" - namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h index a1a7f8310986616d0a9f7db572ed31ca44399027..80b07721f0b4d1feb669bfce91127b0887d79391 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h @@ -31,6 +31,7 @@ limitations under the License. */ #include "paddle/phi/api/lib/utils/tensor_utils.h" #include "paddle/phi/kernels/cpu/elementwise.h" +#include "paddle/phi/kernels/cpu/elementwise_grad.h" #if defined(__NVCC__) || defined(__HIPCC__) #ifdef __NVCC__ @@ -44,6 +45,7 @@ limitations under the License. */ #include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/kernels/gpu/elementwise_grad.h" #endif @@ -133,7 +135,7 @@ inline void GetBroadcastDimsArrays(const framework::DDim &x_dims, inline framework::DDim trim_trailing_singular_dims( const framework::DDim &dims) { - return phi::funcs::trim_trailing_singular_dims(dims); + return phi::funcs::TrimTrailingSingularDims(dims); } template (); - if (x.dims() == y.dims()) { - phi::funcs::ElemwiseGradComputeNoBroadcast( - dev_ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op); - } else { - phi::ElemwiseGradComputeWithBroadcast( - dev_ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op); - } + phi::funcs::ElemwiseGradCompute( + dev_ctx, x, y, out, dout, axis, dx, dy, dx_op, dy_op); } // It is a common implementation to compute binary calculation with the support @@ -173,19 +167,9 @@ void ElementwiseComputeEx(const framework::ExecutionContext &ctx, const framework::Tensor *y, int axis, Functor func, framework::Tensor *z) { z->mutable_data(ctx.GetPlace()); - if (platform::is_gpu_place(ctx.GetPlace())) { -#if defined(__NVCC__) || defined(__HIPCC__) - const auto &dev_ctx = - ctx.template device_context(); - phi::ElementwiseCompute(dev_ctx, *x, *y, axis, func, - z); - -#endif - return; - } - const auto &dev_ctx = - ctx.template device_context(); - phi::ElementwiseCompute(dev_ctx, *x, *y, axis, func, z); + const auto &dev_ctx = ctx.template device_context(); + phi::funcs::ElementwiseCompute(dev_ctx, *x, *y, axis, + func, z); } // FusedElemwiseAndAct @@ -443,8 +427,8 @@ void FusedElemwiseAndActComputeWithBroadcast( axis = (y_dim.size() == 0) ? x_dim.size() : axis; int pre, n, post, is_run_common_broadcast; - phi::funcs::get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post, - &is_run_common_broadcast); + phi::funcs::GetMidDims(x_dim, y_dim, axis, &pre, &n, &post, + &is_run_common_broadcast); if (post == 1) { int h = pre; int w = n; @@ -991,8 +975,8 @@ void FusedElemwiseAndActGradComputeWithBroadcast( axis = (y_dim.size() == 0) ? 
x_dim.size() : axis; int pre, n, post, is_run_common_broadcast; - phi::funcs::get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post, - &is_run_common_broadcast); + phi::funcs::GetMidDims(x_dim, y_dim, axis, &pre, &n, &post, + &is_run_common_broadcast); const T *x_data = nullptr; const T *y_data = nullptr; if (x->IsInitialized()) x_data = x->data(); @@ -1183,14 +1167,6 @@ static inline std::vector GetReduceDim(const framework::DDim &in, } #if defined(__NVCC__) || defined(__HIPCC__) -template -void ReduceWrapper(const platform::CUDADeviceContext &dev_ctx, int axis, - framework::Tensor *src, framework::Tensor *dst) { - std::vector reduce_dims = GetReduceDim(dst->dims(), src->dims(), axis); - TensorReduceImpl>( - dev_ctx, *src, dst, kps::IdentityFunctor(), reduce_dims, - dev_ctx.stream()); -} template void GetGradXAndYOut(const platform::CUDADeviceContext &dev_ctx, @@ -1198,36 +1174,8 @@ void GetGradXAndYOut(const platform::CUDADeviceContext &dev_ctx, std::vector ins, const framework::Tensor *dout, framework::Tensor *dx, framework::Tensor *dy, Functor func) { - framework::Tensor tmp_dx; - framework::Tensor tmp_dy; - dx->mutable_data(place); - dy->mutable_data(place); - std::vector outs; - if (dx->dims() == dout->dims() && dy->dims() == dout->dims()) { - outs = {dx, dy}; - } else if (dx->dims() != dout->dims() && dy->dims() == dout->dims()) { - tmp_dx.mutable_data(dout->dims(), place); - outs = {&tmp_dx, dy}; - } else if (dx->dims() == dout->dims() && dy->dims() != dout->dims()) { - tmp_dy.mutable_data(dout->dims(), place); - outs = {dx, &tmp_dy}; - } else if (dx->dims() != dout->dims() && dy->dims() != dout->dims()) { - tmp_dy.mutable_data(dout->dims(), place); - tmp_dx.mutable_data(dout->dims(), place); - outs = {&tmp_dx, &tmp_dy}; - } - - paddle::operators::LaunchElementwiseCudaKernel( - dev_ctx, ins, &outs, axis, func); - - if (dx->dims() != dout->dims() && dy->dims() == dout->dims()) { - ReduceWrapper(dev_ctx, axis, &tmp_dx, dx); - } else if (dx->dims() == dout->dims() && dy->dims() != dout->dims()) { - ReduceWrapper(dev_ctx, axis, &tmp_dy, dy); - } else if (dx->dims() != dout->dims() && dy->dims() != dout->dims()) { - ReduceWrapper(dev_ctx, axis, &tmp_dx, dx); - ReduceWrapper(dev_ctx, axis, &tmp_dy, dy); - } + phi::GetGradXAndYOut(dev_ctx, place, axis, ins, *dout, dx, dy, + func); } template @@ -1236,22 +1184,8 @@ void GetGradXOrYOut(const platform::CUDADeviceContext &dev_ctx, std::vector ins, const framework::Tensor *dout, framework::Tensor *dxy, Functor func) { - framework::Tensor tmp_dxy; - dxy->mutable_data(place); - - std::vector outs; - if (dxy->dims() != dout->dims()) { - tmp_dxy.mutable_data(dout->dims(), place); - outs = {&tmp_dxy}; - } else { - outs = {dxy}; - } - - paddle::operators::LaunchElementwiseCudaKernel(dev_ctx, ins, &outs, - axis, func); - if (dxy->dims() != dout->dims()) { - ReduceWrapper(dev_ctx, axis, &tmp_dxy, dxy); - } + phi::GetGradXOrYOut(dev_ctx, place, axis, ins, *dout, dxy, + func); } #endif diff --git a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h index 7d7bb4f26fcf42ec63cd1fab7ec2667a03c8ba4c..f49e2ab4e173efbd2cb8a33ec3e7471faff11154 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h @@ -19,7 +19,7 @@ limitations under the License. 
*/ // only can include the headers in paddle/top/api dirs #include "paddle/phi/api/lib/utils/tensor_utils.h" -#include "paddle/phi/kernels/funcs/elementwise_base.h" +#include "paddle/phi/kernels/funcs/broadcast_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc b/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc index 1f8a95f0286bd3bb228bcda59e1198bf0763eb9a..3e9263fe93acd93638ff9e496203b7ea432cea86 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc +++ b/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc @@ -24,7 +24,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" @@ -33,7 +32,7 @@ namespace p = paddle::platform; USE_OP_ITSELF(elementwise_add); USE_OP_DEVICE_KERNEL(elementwise_add, NPU); -USE_OP(elementwise_sub); +USE_OP_ITSELF(elementwise_sub); USE_OP_DEVICE_KERNEL(elementwise_sub, NPU); template diff --git a/paddle/fluid/operators/elementwise/elementwise_pow_op_xpu.cc b/paddle/fluid/operators/elementwise/elementwise_pow_op_xpu.cc index 14b20baae1b0398a40ee74a3e16c2c992a4b557e..78855dd39572539e531bcd8ad3786ae95269ca8f 100644 --- a/paddle/fluid/operators/elementwise/elementwise_pow_op_xpu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_pow_op_xpu.cc @@ -14,7 +14,6 @@ limitations under the License. */ #ifdef PADDLE_WITH_XPU #include "paddle/fluid/operators/elementwise/elementwise_op.h" -#include "paddle/fluid/operators/elementwise/elementwise_sub_op.h" #include "paddle/fluid/operators/elementwise/elementwise_xpu.h" #include "xpu/refactor/math.h" diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.cc b/paddle/fluid/operators/elementwise/elementwise_sub_op.cc index b2cef95d1a349d66161db1c3edf7c14bc8a6d058..d15a7c272757fa683f835215e3db9ccec956af38 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.cc @@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/elementwise/elementwise_sub_op.h" - #include #include "paddle/fluid/operators/elementwise/elementwise_op.h" @@ -78,10 +76,16 @@ class ElementwiseSubDoubleGradMaker : public framework::SingleGradOpMaker { } // namespace paddle REGISTER_ELEMWISE_GRAD_MAKER(elementwise_sub, Sub); -REGISTER_ELEMWISE_EXPLICIT_OP_WITHOUT_GRAD(elementwise_sub, Sub); namespace ops = paddle::operators; +REGISTER_OPERATOR(elementwise_sub, ::paddle::operators::ElementwiseOp, + ::paddle::operators::ElementwiseSubOpMaker, + ::paddle::operators::ElementwiseOpInferVarType, + elementwise_subGradMaker<::paddle::framework::OpDesc>, + elementwise_subGradMaker<::paddle::imperative::OpBase>, + ::paddle::operators::ElementwiseOpInplaceInferer); + REGISTER_OPERATOR( elementwise_sub_grad, ops::ElementwiseOpGrad, ops::ElementwiseGradOpInplaceInferer, ops::ElementwiseGradNoBufVarsInferer, @@ -92,51 +96,6 @@ REGISTER_OPERATOR(elementwise_sub_grad_grad, ops::ElementwiseDoubleGradOpInplaceInferer, ops::ElementwiseDoubleGradNoBufVarsInferer); -REGISTER_OP_CPU_KERNEL( - elementwise_sub, - ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel>, - ops::ElementwiseSubKernel>); -REGISTER_OP_CPU_KERNEL( - elementwise_sub_grad, - ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel>, - ops::ElementwiseSubGradKernel>); -REGISTER_OP_CPU_KERNEL( - elementwise_sub_grad_grad, - ops::ElementwiseSubDoubleGradKernel, - ops::ElementwiseSubDoubleGradKernel, - ops::ElementwiseSubDoubleGradKernel, - ops::ElementwiseSubDoubleGradKernel, - ops::ElementwiseSubDoubleGradKernel, - ops::ElementwiseSubDoubleGradKernel, - ops::ElementwiseSubDoubleGradKernel>, - ops::ElementwiseSubDoubleGradKernel>); - REGISTER_OP_VERSION(elementwise_sub) .AddCheckpoint( R"ROC(Register elementwise_sub for adding the attribute of Scale_y)ROC", diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.cu b/paddle/fluid/operators/elementwise/elementwise_sub_op.cu deleted file mode 100644 index 2c962af9877b978f7a6af25635f345c0ae5ffd27..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op.cu +++ /dev/null @@ -1,63 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/elementwise/elementwise_sub_op.h" - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL( - elementwise_sub, - ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel>, - ops::ElementwiseSubKernel>); -REGISTER_OP_CUDA_KERNEL( - elementwise_sub_grad, - ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel>, - ops::ElementwiseSubGradKernel>); -REGISTER_OP_CUDA_KERNEL( - elementwise_sub_grad_grad, - ops::ElementwiseSubDoubleGradKernel, - ops::ElementwiseSubDoubleGradKernel, - ops::ElementwiseSubDoubleGradKernel, - ops::ElementwiseSubDoubleGradKernel, - ops::ElementwiseSubDoubleGradKernel, - ops::ElementwiseSubDoubleGradKernel>, - ops::ElementwiseSubDoubleGradKernel>); diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.h b/paddle/fluid/operators/elementwise/elementwise_sub_op.h deleted file mode 100644 index 15c547b493ae045c13ab8d6b14a646cb92716a92..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op.h +++ /dev/null @@ -1,96 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include "paddle/fluid/operators/elementwise/elementwise_op.h" -#include "paddle/fluid/platform/place.h" - -#include "paddle/phi/kernels/elementwise_grad_kernel.h" -#include "paddle/phi/kernels/math_kernel.h" -namespace paddle { -namespace operators { - -template -class ElementwiseSubKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* z = ctx.Output("Out"); - z->mutable_data(ctx.GetPlace()); - - auto& dev_ctx = ctx.device_context(); - int axis = ctx.Attr("axis"); - phi::SubtractRawKernel( - static_cast::TYPE&>(dev_ctx), - *x, *y, axis, z); - } -}; - -template -class ElementwiseSubGradKernel : public ElemwiseGradKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - ElemwiseGradKernel::Compute(ctx); - using Tensor = framework::Tensor; - - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - int axis = ctx.Attr("axis"); - auto& dev_ctx = ctx.device_context(); - - phi::SubtractGradKernel( - static_cast::TYPE&>(dev_ctx), - *x, *y, *dout, axis, dx, dy); - } -}; - -template -class ElementwiseSubDoubleGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - using Tensor = framework::Tensor; - - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input("DOut"); - auto* ddx = ctx.Input("DDX"); - auto* ddy = ctx.Input("DDY"); - - auto* ddout = ctx.Output("DDOut"); - int axis = ctx.Attr("axis"); - auto& dev_ctx = ctx.device_context(); - - paddle::optional ddx_optional = paddle::none; - paddle::optional ddy_optional = paddle::none; - if (ddx != nullptr) { - ddx_optional = *ddx; - } - if (ddy != nullptr) { - ddy_optional = *ddy; - } - phi::SubtractDoubleGradKernel( - static_cast::TYPE&>(dev_ctx), - *y, ddx_optional, ddy_optional, *dout, axis, ddout); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc index b68d38d6df12a5d11f57b1556f8fc7ceec00d3e0..4169a938f2d0bff0cf8b23db35c943c9ff586212 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc @@ -15,7 +15,7 @@ limitations under the License. */ #include #include -#include "paddle/fluid/operators/elementwise/elementwise_sub_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_op.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op_xpu.cc b/paddle/fluid/operators/elementwise/elementwise_sub_op_xpu.cc index d12c6fc30cebaafd27c099ab708e0662477cb017..87c494b0e10bad64566b5248946c9b8b1b778f2f 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op_xpu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op_xpu.cc @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/elementwise/elementwise_sub_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op.h" #include "paddle/fluid/operators/elementwise/elementwise_xpu.h" #include "xpu/refactor/math.h" diff --git a/paddle/fluid/operators/elementwise/test_elementwise_add_grad_grad.cc b/paddle/fluid/operators/elementwise/test_elementwise_add_grad_grad.cc index 5222103256d614a2d6b1fa10662367ecb20d3cb2..ea009a38056f078689bd6dc4c9a41d2b34e8c1fa 100644 --- a/paddle/fluid/operators/elementwise/test_elementwise_add_grad_grad.cc +++ b/paddle/fluid/operators/elementwise/test_elementwise_add_grad_grad.cc @@ -17,8 +17,13 @@ #include "paddle/fluid/operators/elementwise/test_elementwise_op_grad_grad.h" #include "paddle/fluid/platform/place.h" #include "paddle/phi/core/ddim.h" +#include "paddle/phi/core/kernel_registry.h" USE_OP_ITSELF(elementwise_add); +PD_DECLARE_KERNEL(add_double_grad, CPU, ALL_LAYOUT); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_DECLARE_KERNEL(add_double_grad, GPU, ALL_LAYOUT); +#endif namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc b/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc index 9d4d11609ac2047aa8934cb2868f79359a816e12..ce5c6b701d95894db8e3a84215f537352914706a 100644 --- a/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc +++ b/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc @@ -21,9 +21,12 @@ #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" +#include "paddle/phi/core/kernel_registry.h" USE_OP_ITSELF(elementwise_add); +PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/elementwise/test_elementwise_div_grad_grad.cc b/paddle/fluid/operators/elementwise/test_elementwise_div_grad_grad.cc index 9aa206efed8c0111f56b6651e0228acc316b1bfe..3cecc52a3c481cf9cb4a1e2eba6ded704a8fa8ee 100644 --- a/paddle/fluid/operators/elementwise/test_elementwise_div_grad_grad.cc +++ b/paddle/fluid/operators/elementwise/test_elementwise_div_grad_grad.cc @@ -27,8 +27,14 @@ #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" +#include "paddle/phi/core/kernel_registry.h" -USE_OP(elementwise_div); +USE_OP_ITSELF(elementwise_div); + +PD_DECLARE_KERNEL(divide_double_grad, CPU, ALL_LAYOUT); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_DECLARE_KERNEL(divide_double_grad, GPU, ALL_LAYOUT); +#endif namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/empty_op.cc b/paddle/fluid/operators/empty_op.cc index e23342ebb5dc7639d68500964bfdfbd099d077cd..9e0e4e7fe1c6d26df7c4347d8bc81a985e6c973b 100644 --- a/paddle/fluid/operators/empty_op.cc +++ b/paddle/fluid/operators/empty_op.cc @@ -12,9 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/empty_op.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/infermeta/nullary.h" + namespace paddle { namespace operators { @@ -51,46 +53,6 @@ class EmptyOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* context) const override { - OP_INOUT_CHECK(context->HasOutput("Out"), "Output", "Out", "empty"); - - if (context->HasInput("ShapeTensor")) { - auto shape_dims = context->GetInputDim("ShapeTensor"); - int num_ele = 1; - for (int i = 0; i < shape_dims.size(); ++i) { - num_ele *= shape_dims[i]; - } - auto vec_dims = std::vector(num_ele, -1); - context->SetOutputDim("Out", phi::make_ddim(vec_dims)); - } else if (context->HasInputs("ShapeTensorList")) { - std::vector out_dims; - auto dims_list = context->GetInputsDim("ShapeTensorList"); - for (size_t i = 0; i < dims_list.size(); ++i) { - auto& dims = dims_list[i]; - PADDLE_ENFORCE_EQ(dims, phi::make_ddim({1}), - platform::errors::InvalidArgument( - "The shape of Tensor in list must be [1]. " - "But received the shape is [%s]", - dims)); - - out_dims.push_back(-1); - } - - context->SetOutputDim("Out", phi::make_ddim(out_dims)); - } else { - auto& shape = context->Attrs().Get>("shape"); - for (size_t i = 0; i < shape.size(); ++i) { - PADDLE_ENFORCE_GE( - shape[i], 0, - platform::errors::InvalidArgument( - "Each value of attribute 'shape' is expected to be no less " - "than 0. But recieved: shape[%u] = %d; shape = [%s].", - i, shape[i], phi::make_ddim(shape))); - } - context->SetOutputDim("Out", phi::make_ddim(shape)); - } - } - protected: framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, const framework::Tensor& tensor, @@ -126,14 +88,8 @@ class EmptyOpVarTypeInference : public framework::VarTypeInference { namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OPERATOR( - empty, ops::EmptyOp, ops::EmptyOpMaker, ops::EmptyOpVarTypeInference, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); - -REGISTER_OP_CPU_KERNEL(empty, ops::EmptyKernel, - ops::EmptyKernel, - ops::EmptyKernel, - ops::EmptyKernel, - ops::EmptyKernel, - ops::EmptyKernel); +DECLARE_INFER_SHAPE_FUNCTOR(empty, EmptyInferShapeFunctor, + PD_INFER_META(phi::CreateInferMeta)); +REGISTER_OP_WITHOUT_GRADIENT(empty, ops::EmptyOp, ops::EmptyOpMaker, + ops::EmptyOpVarTypeInference, + EmptyInferShapeFunctor); diff --git a/paddle/fluid/operators/empty_op.cu.cc b/paddle/fluid/operators/empty_op.cu.cc deleted file mode 100644 index 22799e507aeff7940274f729b174f50bfd9132a5..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/empty_op.cu.cc +++ /dev/null @@ -1,26 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/empty_op.h" - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL( - empty, ops::EmptyKernel, - ops::EmptyKernel, - ops::EmptyKernel, - ops::EmptyKernel, - ops::EmptyKernel, - ops::EmptyKernel); diff --git a/paddle/fluid/operators/empty_op.h b/paddle/fluid/operators/empty_op.h deleted file mode 100644 index cb466fffcd7c7358b6e84c18b7895a17b2eaa907..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/empty_op.h +++ /dev/null @@ -1,47 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include - -#include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/utils.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class EmptyKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - auto dtype = static_cast( - context.Attr("dtype")); - - Tensor *out_tensor = context.Output("Out"); - - auto shape = GetShape(context); - out_tensor->Resize(shape); - - out_tensor->mutable_data(context.GetPlace(), - framework::TransToPhiDataType(dtype)); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/erf_op.cc b/paddle/fluid/operators/erf_op.cc index f68f670394871114369f8b05b7f958c03d5508d0..64274d098c0585c28196743c09d5e6c78c3fe37d 100644 --- a/paddle/fluid/operators/erf_op.cc +++ b/paddle/fluid/operators/erf_op.cc @@ -16,8 +16,10 @@ limitations under the License. 
*/ #include #include -#include "paddle/fluid/operators/erf_op.h" -#include "paddle/fluid/platform/float16.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -29,18 +31,6 @@ class ErfOp : public framework::OperatorWithKernel { const framework::AttributeMap &attrs) : OperatorWithKernel(type, inputs, outputs, attrs) {} - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( - "Input(%s) of ErfOp should not be null.", "X")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output(%s) of ErfOp should not be null.", "Out")); - - ctx->ShareDim("X", /*->*/ "Out"); - ctx->ShareLoD("X", /*->*/ "Out"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -116,28 +106,10 @@ class ErfGradOpMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(erf, ErfInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); REGISTER_OPERATOR(erf, ops::ErfOp, ops::ErfOpMaker, ops::ErfGradOpMaker, - ops::ErfGradOpMaker); + ops::ErfGradOpMaker, + ErfInferShapeFunctor); REGISTER_OPERATOR(erf_grad, ops::ErfGradOp); -REGISTER_OP_CPU_KERNEL( - erf, ops::ErfKernel, - ops::ErfKernel, - ops::ErfKernel); -REGISTER_OP_CPU_KERNEL( - erf_grad, ops::ErfGradKernel, - ops::ErfGradKernel, - ops::ErfGradKernel); - -REGISTER_OP_CUDA_KERNEL( - erf, ops::ErfKernel, - ops::ErfKernel, - ops::ErfKernel); -REGISTER_OP_CUDA_KERNEL( - erf_grad, ops::ErfGradKernel, - ops::ErfGradKernel, - ops::ErfGradKernel); diff --git a/paddle/fluid/operators/erf_op.h b/paddle/fluid/operators/erf_op.h deleted file mode 100644 index 4780b2e7f5b28d4a743f6d35046891b30cbefd00..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/erf_op.h +++ /dev/null @@ -1,66 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#ifndef _USE_MATH_DEFINES -#define _USE_MATH_DEFINES -#endif -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/eigen/eigen_function.h" - -namespace paddle { -namespace operators { - -template -class ErfKernel : public framework::OpKernel { - public: - virtual void Compute(const framework::ExecutionContext& context) const { - auto* out = context.Output("Out"); - auto* in = context.Input("X"); - out->mutable_data(in->place()); - - auto eigen_out = framework::EigenVector::Flatten(*out); - auto eigen_in = framework::EigenVector::Flatten(*in); - auto& place = - *context.template device_context().eigen_device(); - EigenErf, T>::Eval(place, eigen_out, - eigen_in); - } -}; - -template -class ErfGradKernel : public framework::OpKernel { - public: - virtual void Compute(const framework::ExecutionContext& context) const { - auto* x = context.Input("X"); - auto* dout = - context.Input(framework::GradVarName("Out")); - auto* dx = context.Output(framework::GradVarName("X")); - - dx->mutable_data(dout->place()); - - auto eigen_x = framework::EigenVector::Flatten(*x); - auto eigen_dout = framework::EigenVector::Flatten(*dout); - auto eigen_dx = framework::EigenVector::Flatten(*dx); - auto& place = - *context.template device_context().eigen_device(); - EigenErfGrad, T>::Eval(place, eigen_dx, - eigen_x, eigen_dout); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/erfinv_op.cc b/paddle/fluid/operators/erfinv_op.cc index 3d409b4c4f6772bc7b234208e78c5088eeb2fc00..374b00792622f91edc0b66cebb278cc79f30dc66 100644 --- a/paddle/fluid/operators/erfinv_op.cc +++ b/paddle/fluid/operators/erfinv_op.cc @@ -73,8 +73,8 @@ DECLARE_INPLACE_OP_INFERER(ErfinvInplaceInferer, {"X", "Out"}); } // namespace operators } // namespace paddle -DELCARE_INFER_SHAPE_FUNCTOR(erfinv, ErfinvInferShapeFunctor, - PT_INFER_META(phi::UnchangedInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(erfinv, ErfinvInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); REGISTER_OPERATOR( erfinv, paddle::operators::ErfinvOp, paddle::operators::ErfinvOpMaker, diff --git a/paddle/fluid/operators/expand_as_v2_op.cc b/paddle/fluid/operators/expand_as_v2_op.cc index 119e514a49e28fb3295e36947664770889bbdd81..97a35a34f23e96707269482e29da13a15538cdca 100755 --- a/paddle/fluid/operators/expand_as_v2_op.cc +++ b/paddle/fluid/operators/expand_as_v2_op.cc @@ -121,37 +121,9 @@ REGISTER_OPERATOR(expand_as_v2, ops::ExpandAsV2Op, ops::ExpandAsV2OpMaker, ops::ExpandAsV2GradOpMaker); REGISTER_OPERATOR(expand_as_v2_grad, ops::ExpandAsV2GradOp, ops::ExpandAsV2GradNoNeedBufVarsInferer); -REGISTER_OP_CPU_KERNEL( - expand_as_v2, - ops::ExpandAsV2Kernel, - ops::ExpandAsV2Kernel, - ops::ExpandAsV2Kernel, - ops::ExpandAsV2Kernel, - ops::ExpandAsV2Kernel); -REGISTER_OP_CPU_KERNEL( - expand_as_v2_grad, - ops::ExpandAsV2GradKernel, - ops::ExpandAsV2GradKernel, - ops::ExpandAsV2GradKernel, - ops::ExpandAsV2GradKernel); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -REGISTER_OP_CUDA_KERNEL( - expand_as_v2, - ops::ExpandAsV2Kernel, - ops::ExpandAsV2Kernel, - ops::ExpandAsV2Kernel, - ops::ExpandAsV2Kernel, - ops::ExpandAsV2Kernel); -REGISTER_OP_CUDA_KERNEL( - expand_as_v2_grad, - ops::ExpandAsV2GradKernel, - ops::ExpandAsV2GradKernel, - ops::ExpandAsV2GradKernel, - ops::ExpandAsV2GradKernel); -#endif REGISTER_OP_VERSION(expand_as_v2) .AddCheckpoint( R"ROC(fix expand_as_v2 and add new input [Y])ROC", 
paddle::framework::compatible::OpVersionDesc().NewInput( - "Y", "Expand X according to the shape of Y")); \ No newline at end of file + "Y", "Expand X according to the shape of Y")); diff --git a/paddle/fluid/operators/expand_as_v2_op.h b/paddle/fluid/operators/expand_as_v2_op.h index d7560efc5c1f1244ae4eed4c68c59a38287057ee..f09e7764eed3959c7f0ca700b953dbd0c2891d12 100755 --- a/paddle/fluid/operators/expand_as_v2_op.h +++ b/paddle/fluid/operators/expand_as_v2_op.h @@ -32,219 +32,5 @@ template using EigenTensor = framework::EigenTensor; -template -class ExpandAsV2Kernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto rank = context.Input("X")->dims().size(); - auto target_shape = context.Attr>("target_shape"); - auto target_rank = target_shape.size(); - PADDLE_ENFORCE_GE(target_rank, rank, - platform::errors::InvalidArgument( - "The rank (%d) of the input 'target_tensor' for " - "expand_as_v2 op must be greater than or equal to " - "the rank (%d) of the input 'x'.", - target_rank, rank)); - PADDLE_ENFORCE_GE(rank, 1, platform::errors::InvalidArgument( - "The rank (%d) of the input 'x' for " - "expand_as_v2 op must be positive.", - rank)); - PADDLE_ENFORCE_LE(target_rank, MAX_RANK_SUPPORTED, - platform::errors::InvalidArgument( - "The rank (%d) of the input 'target_tensor' for " - "expand_as_v2 op must be less than or equal to %d.", - target_rank, MAX_RANK_SUPPORTED)); - - switch (target_rank) { - case 1: - ExpandAs<1>(context); - break; - case 2: - ExpandAs<2>(context); - break; - case 3: - ExpandAs<3>(context); - break; - case 4: - ExpandAs<4>(context); - break; - case 5: - ExpandAs<5>(context); - break; - case 6: - ExpandAs<6>(context); - break; - } - } - - protected: - template - void ExpandAs(const framework::ExecutionContext& context) const { - auto* in0 = context.Input("X"); - auto in_dims = in0->dims(); - auto target_shape = context.Attr>("target_shape"); - auto vec_in_dims = phi::vectorize(in_dims); - auto diff = target_shape.size() - vec_in_dims.size(); - vec_in_dims.insert(vec_in_dims.begin(), diff, 1); - std::vector repeat_times(vec_in_dims.size()); - for (size_t i = 0; i < vec_in_dims.size(); ++i) { - PADDLE_ENFORCE_NE(target_shape[i], 0, - platform::errors::InvalidArgument( - "The value of target shape cannot be zero.")); - if (i < diff) { - PADDLE_ENFORCE_GT( - target_shape[i], 0, - platform::errors::InvalidArgument( - "The expanded size (%d) for non-existing dimensions must be " - "positive for expand_as_v2 op.", - target_shape[i])); - repeat_times[i] = target_shape[i]; - } else if (target_shape[i] > 0) { - if (vec_in_dims[i] != 1) { - PADDLE_ENFORCE_EQ( - vec_in_dims[i], target_shape[i], - platform::errors::InvalidArgument( - "The value (%d) of the non-singleton dimension does not match" - " the corresponding value (%d) in shape for expand_as_v2 op.", - vec_in_dims[i], target_shape[i])); - repeat_times[i] = 1; - } else { - repeat_times[i] = target_shape[i]; - } - } else { - PADDLE_ENFORCE_EQ( - target_shape[i], -1, - platform::errors::InvalidArgument( - "When the value in shape is negative for expand_as_v2 op, " - "only -1 is supported, but the value received is %d.", - target_shape[i])); - repeat_times[i] = 1; - } - } - auto* out0 = context.Output("Out"); - Eigen::DSizes bcast_dims; - for (size_t i = 0; i < repeat_times.size(); ++i) { - bcast_dims[i] = repeat_times[i]; - } - - framework::DDim new_in_dims = phi::make_ddim(vec_in_dims); - framework::DDim out_dims = phi::make_ddim(target_shape); - - 
out0->Resize(out_dims); - auto x = EigenTensor::From(*in0, new_in_dims); - out0->mutable_data(context.GetPlace()); - auto y = EigenTensor::From(*out0, out_dims); - auto& place = - *context.template device_context().eigen_device(); - EigenBroadcast, T, Rank>::Eval(place, y, x, - bcast_dims); - } -}; - -template -class ExpandAsV2GradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in0 = context.Input("X"); - auto target_shape = context.Attr>("target_shape"); - auto x_dims = in0->dims(); - auto vec_in_dims = phi::vectorize(x_dims); - auto diff = target_shape.size() - vec_in_dims.size(); - vec_in_dims.insert(vec_in_dims.begin(), diff, 1); - std::vector repeat_times(vec_in_dims.size()); - for (size_t i = 0; i < vec_in_dims.size(); ++i) { - repeat_times[i] = target_shape[i] / vec_in_dims[i]; - } - std::vector reshape_dims_vec; - std::vector reduce_dims_vec; - for (size_t i = 0; i < repeat_times.size(); ++i) { - reduce_dims_vec.push_back(reshape_dims_vec.size()); - reshape_dims_vec.push_back(repeat_times[i]); - reshape_dims_vec.push_back(vec_in_dims[i]); - } - - int dims = reduce_dims_vec.size(); - bool just_copy = true; - for (size_t i = 0; i < repeat_times.size(); i++) { - if (repeat_times[i] != 1) { - just_copy = false; - break; - } - } - // no need reduce, just copy - if (just_copy) { - auto* in0 = context.Input(framework::GradVarName("Out")); - auto* out0 = context.Output(framework::GradVarName("X")); - out0->mutable_data(context.GetPlace()); - framework::TensorCopy(*in0, context.GetPlace(), context.device_context(), - out0); - } else { - PADDLE_ENFORCE_GE(dims, 1, - platform::errors::InvalidArgument( - "The rank of the input 'Out@GRAD' for " - "expand_as_v2_grad op must be greater than or " - "equal to 1, but the value received is %d.", - dims)); - PADDLE_ENFORCE_LE(dims, MAX_RANK_SUPPORTED, - platform::errors::InvalidArgument( - "The rank of the input 'Out@GRAD' for " - "expand_as_v2_grad op must be less than or equal " - "to %d, but the value received is %d.", - MAX_RANK_SUPPORTED, dims)); - switch (dims) { - case 1: - ExpandAsBackward<1>(context, reshape_dims_vec, reduce_dims_vec); - break; - case 2: - ExpandAsBackward<2>(context, reshape_dims_vec, reduce_dims_vec); - break; - case 3: - ExpandAsBackward<3>(context, reshape_dims_vec, reduce_dims_vec); - break; - case 4: - ExpandAsBackward<4>(context, reshape_dims_vec, reduce_dims_vec); - break; - case 5: - ExpandAsBackward<5>(context, reshape_dims_vec, reduce_dims_vec); - break; - case 6: - ExpandAsBackward<6>(context, reshape_dims_vec, reduce_dims_vec); - break; - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "Only support tensor with rank being between 1 and 6. 
But " - "received tensor's rank = %d.", - dims)); - } - } - } - - protected: - template - void ExpandAsBackward(const framework::ExecutionContext& context, - const std::vector& reshape_dims_vec, - const std::vector& reduce_dims_vec) const { - size_t reshape_size = reshape_dims_vec.size(); - size_t reduce_size = reduce_dims_vec.size(); - auto* in0 = context.Input(framework::GradVarName("Out")); - auto* out0 = context.Output(framework::GradVarName("X")); - out0->mutable_data(context.GetPlace()); - auto x_grad = EigenVector::Flatten(*out0); - Eigen::DSizes reshape_dims; - for (size_t i = 0; i < reshape_size; ++i) { - reshape_dims[i] = reshape_dims_vec[i]; - } - Eigen::DSizes reduce_dims; - for (size_t i = 0; i < reduce_size; ++i) { - reduce_dims[i] = reduce_dims_vec[i]; - } - auto out_grad = EigenVector::Flatten(*in0); - auto& place = - *context.template device_context().eigen_device(); - EigenBroadcastGrad, T, Dims>::Eval( - place, x_grad, out_grad, reduce_dims, reshape_dims); - } -}; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/expand_op_npu_test.cc b/paddle/fluid/operators/expand_op_npu_test.cc index cdd4e1dbaae6a6a74bb11be44589877234021764..df00ae54c1036b1b0f0899eb0a949d58c398aa48 100644 --- a/paddle/fluid/operators/expand_op_npu_test.cc +++ b/paddle/fluid/operators/expand_op_npu_test.cc @@ -24,7 +24,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/exponential_op.cc b/paddle/fluid/operators/exponential_op.cc index ee456dcdafbc51d547e7beacc4e4e79f98738b88..1a48a6767852e138e7725a68ca4ffc56de8234be 100644 --- a/paddle/fluid/operators/exponential_op.cc +++ b/paddle/fluid/operators/exponential_op.cc @@ -76,7 +76,7 @@ class ExponentialKernel auto engine = gen->GetCPUEngine(); std::uniform_real_distribution uniform(0.0, 1.0); - distribution::exponential_transform trans(lambda); + phi::funcs::exponential_transform trans(lambda); for (int64_t i = 0; i < size; ++i) { out_data[i] = trans(uniform(*engine)); } diff --git a/paddle/fluid/operators/exponential_op.cu b/paddle/fluid/operators/exponential_op.cu index 8b989501e4f4248b0c2e3b23e1e75a4865b08588..d5abbf9a26afe6bcbbd8549f59d632fc4e53fec2 100644 --- a/paddle/fluid/operators/exponential_op.cu +++ b/paddle/fluid/operators/exponential_op.cu @@ -26,9 +26,9 @@ class ExponentialKernel auto& dev_cxt = ctx.template device_context(); T lambda = static_cast(ctx.Attr("lambda")); - distribution::uniform_distribution dist; - distribution::exponential_transform trans(lambda); - distribution::distribution_and_transform(dev_cxt, out, dist, trans); + phi::funcs::uniform_distribution dist; + phi::funcs::exponential_transform trans(lambda); + phi::funcs::distribution_and_transform(dev_cxt, out, dist, trans); } }; diff --git a/paddle/fluid/operators/exponential_op.h b/paddle/fluid/operators/exponential_op.h index fbcabc594db0814da1ec50934a0f02514dc208be..7ded174a9f47ede48a49b19b25539867ce344fb0 100644 --- a/paddle/fluid/operators/exponential_op.h +++ b/paddle/fluid/operators/exponential_op.h @@ -17,7 +17,7 @@ #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/distribution_helper.h" +#include 
"paddle/phi/kernels/funcs/distribution_helper.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { diff --git a/paddle/fluid/operators/eye_op.cc b/paddle/fluid/operators/eye_op.cc index 8f8a0f174a79f13f0bee7aa7b425f8c645e15687..537c218d357b67980216ab3053707b8adb867c01 100644 --- a/paddle/fluid/operators/eye_op.cc +++ b/paddle/fluid/operators/eye_op.cc @@ -12,7 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/nullary.h" namespace paddle { namespace operators { @@ -21,24 +24,6 @@ class EyeOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output(Out) of EyeOP should not be null.")); - auto num_rows = ctx->Attrs().Get("num_rows"); - PADDLE_ENFORCE_EQ( - num_rows >= 0, true, - platform::errors::InvalidArgument( - "The value of Input(num_rows) should be non-negative int.")); - auto num_columns = ctx->Attrs().Get("num_columns"); - if (num_columns == -1) num_columns = num_rows; - PADDLE_ENFORCE_EQ( - num_columns >= 0, true, - platform::errors::InvalidArgument( - "The value of Input(num_columns) should be non-negative int.")); - ctx->SetOutputDim("Out", {num_rows, num_columns}); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -82,8 +67,11 @@ Return an identity tensor whose shape is [num_rows, num_columns]. } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(eye, EyeInferShapeFunctor, + PD_INFER_META(phi::EyeInferMeta)); REGISTER_OPERATOR( eye, ops::EyeOp, ops::EyeOpMaker, ops::EyeOpVarTypeInference, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); + paddle::framework::EmptyGradOpMaker, + EyeInferShapeFunctor); diff --git a/paddle/fluid/operators/feed_forward_test.cu b/paddle/fluid/operators/feed_forward_test.cu index 0eb84f18f25f03b1fd0310c5815ee342ff835a6f..27a235765227f15dd412dcd6ad55f2a24471c6da 100644 --- a/paddle/fluid/operators/feed_forward_test.cu +++ b/paddle/fluid/operators/feed_forward_test.cu @@ -21,6 +21,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/fused/attn_feed_forward.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace framework = paddle::framework; @@ -29,6 +30,11 @@ namespace platform = paddle::platform; USE_OP(matmul); USE_OP_ITSELF(elementwise_add); +PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_DECLARE_KERNEL(add_grad, GPU, ALL_LAYOUT); +#endif + // get paddle matmul op results as baseline template void GetLinearOp(const std::vector &x, const std::vector &y, diff --git a/paddle/fluid/operators/fill_constant_op_npu.cc b/paddle/fluid/operators/fill_constant_op_npu.cc index 79018f2a97448a8c6265a969dad37bce77d1b7ee..cb03add3143278260d41c3893e7adad976908d4e 100644 --- a/paddle/fluid/operators/fill_constant_op_npu.cc +++ b/paddle/fluid/operators/fill_constant_op_npu.cc @@ -65,7 +65,7 @@ class FillConstantNPUKernel : public framework::OpKernel { tensor_value.mutable_data({1}, ctx.GetPlace()); FillNpuTensorWithConstant(&tensor_value, value); NpuOpRunner runner; -#if (CANN_VERSION_CODE >= 503003) +#if (CANN_VERSION_CODE >= 503003 && CANN_VERSION_CODE < 504001) runner.SetType("FillD") .AddInput(tensor_value) .AddOutput(*out_var) diff --git a/paddle/fluid/operators/filter_by_instag_op.cu b/paddle/fluid/operators/filter_by_instag_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..508730c3c7335dbad8cf70417d2c19be4a8480a2 --- /dev/null +++ b/paddle/fluid/operators/filter_by_instag_op.cu @@ -0,0 +1,655 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// #if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11000 + +#if defined(PADDLE_WITH_CUDA) +#include +#endif + +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/mixed_vector.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/enforce.h" + +#include "paddle/fluid/operators/filter_by_instag_op.h" + +#if defined(PADDLE_WITH_CUDA) +namespace cg = cooperative_groups; +#endif + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using SelectedRows = phi::SelectedRows; +using LoDTensor = framework::LoDTensor; + +template +using Vector = framework::Vector; + +#define WARP_SIZE 32 +#define MAX_WARP_NUM 32 + +#if defined(PADDLE_WITH_CUDA) + +template +__global__ void filter_copy_fuse_kernel( + const size_t N, const int ins_per_thread, size_t* x1_lods_data, + size_t* x2_lods_data, const int64_t* x2_data, const int64_t* x3_data, + int64_t filter_tag_size, T* out_data, int64_t* map_data, + size_t* map_lods_data, size_t* out_lods_data, size_t* out_idx_data, + const T* x1_data, int x1_embed_size, float* loss_weight_data, + float fill_value) { + // N is instance num + // one threads for ins_per_thread instances + int idx = blockIdx.x * blockDim.x + threadIdx.x; + + cg::thread_block b = cg::this_thread_block(); + cg::thread_block_tile g = cg::tiled_partition(b); + + int gid = idx / WARP_SIZE; + + // general use + int thread_num = + (N + (ins_per_thread - 1)) / ins_per_thread; // real thread num + int total_warp_num = thread_num / WARP_SIZE; // 30 + int remain_thread_num = thread_num % WARP_SIZE; // 16 + + int warp_thread_num = -1; + if (gid < total_warp_num) { + warp_thread_num = WARP_SIZE; + } else { + warp_thread_num = remain_thread_num; + } + + int group_num = total_warp_num; + if (remain_thread_num > 0) { + group_num = total_warp_num + 1; + } + + if (gid >= group_num) return; + + int ins_start = idx * ins_per_thread; + int ins_end = (idx + 1) * ins_per_thread; + + if (N < ins_end) ins_end = N; + + /* + if (!x1_lods_filled) { + for (int p = ins_start; p < ins_end; p++) { + x1_lods_data[p] = p; + } + if (idx == 0) { + x1_lods_data[N] = N; + } + } + + if (!x2_lods_filled) { + for (int p = ins_start; p < ins_end; p++) { + x2_lods_data[p] = p; + } + if (idx == 0) { + x2_lods_data[N] = N; + } + } + + if (!x1_lods_filled || !x2_lods_filled) { + b.sync(); + } + */ + + int flag_data[5]; + int prefix_sum_data[5]; + int prefix_sum_data2[5]; + + __shared__ int shr[MAX_WARP_NUM]; + __shared__ int shr2[MAX_WARP_NUM]; + __shared__ int shr3[MAX_WARP_NUM]; + + for (int p = ins_start; p < ins_end; p++) { + int ins_tag_start = x2_lods_data[p]; + int ins_tag_end = x2_lods_data[p + 1]; + flag_data[p - ins_start] = 0; + // filter logic + int i = ins_tag_start; + for (; i < ins_tag_end; i++) { + int64_t ins_tag = x2_data[i]; + int j = 0; + for (; j < filter_tag_size; j++) { + if (x3_data[j] == ins_tag) break; + } + // if ins_tag in filter tag + if (j < filter_tag_size) { + flag_data[p - ins_start] = 1; + break; + } + } + } + + int sum_addr = 0; + int sum_flag = 0; + int sum_out_lods = 0; + + int local_addr = 0; + int local_flag = 0; + int local_out_lods = 0; + + if (ins_start < ins_end) { + for (int p = ins_start; p < ins_end; p++) { + int previous = -1; + if (p == ins_start) { + previous = 0; + } else { + previous = 
prefix_sum_data[p - ins_start - 1]; + } + + prefix_sum_data[p - ins_start] = + previous + + flag_data[p - ins_start] * (x1_lods_data[p + 1] - x1_lods_data[p]); + } + + local_addr = prefix_sum_data[ins_end - 1 - ins_start]; + sum_addr = local_addr; + + // flag + // local_flag = 0; + for (int p = ins_start; p < ins_end; p++) { + local_flag += flag_data[p - ins_start]; + } + sum_flag = local_flag; + + for (int p = ins_start; p < ins_end; p++) { + local_out_lods += + flag_data[p - ins_start] * (x1_lods_data[p + 1] - x1_lods_data[p]); + } + + sum_out_lods = local_out_lods; + } + + // 32 threads + for (int i = 1; i < warp_thread_num; i *= 2) { + int temp_addr = g.shfl_up(sum_addr, i); + int temp_flag = g.shfl_up(sum_flag, i); + int temp_out_lods = g.shfl_up(sum_out_lods, i); + + if (g.thread_rank() >= i) { + sum_addr += temp_addr; + sum_flag += temp_flag; + sum_out_lods += temp_out_lods; + } + } + + if (g.thread_rank() == warp_thread_num - 1) { + shr[gid] = sum_addr; + shr2[gid] = sum_flag; + shr3[gid] = sum_out_lods; + } + + b.sync(); + + int sum_addr2 = 0; + int sum_flag2 = 0; + int sum_out_lods2 = 0; + + // communicate between warp + if (g.thread_rank() < group_num) { + sum_addr2 = shr[g.thread_rank()]; + sum_flag2 = shr2[g.thread_rank()]; + sum_out_lods2 = shr3[g.thread_rank()]; + } + + for (int i = 1; i < group_num; i *= 2) { + int temp_addr2 = g.shfl_up(sum_addr2, i); + int temp_flag2 = g.shfl_up(sum_flag2, i); + int temp_out_lods2 = g.shfl_up(sum_out_lods2, i); + + if (g.thread_rank() >= i) { + sum_addr2 += temp_addr2; + sum_flag2 += temp_flag2; + sum_out_lods2 += temp_out_lods2; + } + } + + int sum_addr3 = g.shfl(sum_addr2, gid); + int sum_flag3 = g.shfl(sum_flag2, gid); + int sum_out_lods3 = g.shfl(sum_out_lods2, gid); + + int p_flag; + int p_addr; + int p_out_lods; + + if (ins_start < ins_end) { + p_addr = sum_addr3 - shr[gid] + sum_addr - local_addr; + p_flag = sum_flag3 - shr2[gid] + sum_flag - local_flag; + p_out_lods = sum_out_lods3 - shr3[gid] + sum_out_lods - local_out_lods; + + for (int p = ins_start; p < ins_end; p++) { + if (ins_start == p) { + prefix_sum_data2[p - ins_start] = p_addr; + } else { + prefix_sum_data2[p - ins_start] = + prefix_sum_data2[p - ins_start - 1] + + flag_data[p - ins_start - 1] * + (x1_lods_data[p] - x1_lods_data[p - 1]); + } + } + + if (gid == 0 && g.thread_rank() == group_num - 1) { + *out_idx_data = (sum_flag2 + 1); + map_lods_data[sum_flag2] = sum_flag2; + } + } + + int sum_out_lods4 = g.shfl(sum_out_lods2 + 1, group_num - 1); + + if (ins_start < ins_end) { + int out_lods_idx = p_flag + 1; + + // ins_start = 1 + // BUG fix + for (int p = ins_start; p < ins_end; p++) { + if (flag_data[p - ins_start] == 1) { + // batch_len = 2 + // batch_len = 4 + size_t batch_len = x1_lods_data[p + 1] - x1_lods_data[p]; + // t = 0 + // t = 1 + int t = out_lods_idx - 1; + // out_lods_data[0] = 0; + int previous; + + if (out_lods_idx == p_flag + 1) { + // out_lods_data[t] = p_out_lods; + previous = p_out_lods; + } else { + previous = out_lods_data[t]; + } + + map_data[t * 3] = (int64_t)previous; + map_data[t * 3 + 1] = x1_lods_data[p]; + map_lods_data[t] = t; + out_lods_data[out_lods_idx] = previous + batch_len; + map_data[t * 3 + 2] = batch_len; + out_lods_idx++; + } + } + + // fill loss_weight_data + if (sum_out_lods4 > 1) { + int out_data_num = sum_out_lods4 - 1; + int out_start = ins_start; + + if (out_start < out_data_num) { + int out_end = ins_end >= out_data_num ? 
out_data_num : ins_end; + for (int p = out_start; p < out_end; p++) { + loss_weight_data[p] = fill_value; + } + } + } + + for (int p = ins_start; p < ins_end; p++) { + // copy logic + if (flag_data[p - ins_start] == 1) { + auto output_start_idx = prefix_sum_data2[p - ins_start]; + T* dst = out_data + output_start_idx * x1_embed_size; + + const T* src_start = x1_data + x1_lods_data[p] * x1_embed_size; + const T* src_end = x1_data + x1_lods_data[p + 1] * x1_embed_size; + + // optimized + for (const T *j = src_start; j != src_end; dst++, j++) { + *dst = *j; + } + } + } + } + + b.sync(); +} + +template +__global__ void copy_grad_kernel(const size_t N, const int ins_per_thread, + const T* out_grad_data, T* x1_grad_data, + const int64_t* map_data, int x1_embed_size) { + // N is instance num + // one threads for one instance + int idx = blockIdx.x * blockDim.x + threadIdx.x; + int ins_start = idx * ins_per_thread; + int ins_end = (idx + 1) * ins_per_thread; + + if (ins_start >= N) { + return; + } + if (ins_end > N) ins_end = N; + + for (int p = ins_start; p < ins_end; p++) { + T* dst = x1_grad_data + map_data[p * 3 + 1] * x1_embed_size; + const T* src_start = out_grad_data + map_data[p * 3] * x1_embed_size; + const T* src_end = + out_grad_data + (map_data[p * 3] + map_data[p * 3 + 2]) * x1_embed_size; + + for (const T *j = src_start; j != src_end; dst++, j++) { + *dst = *j; + } + } +} + +#endif + +template +class FilterByInstagGPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { +#if defined(PADDLE_WITH_CUDA) + + auto gpu_place = context.GetPlace(); + + gpuStream_t current_stream = context.cuda_device_context().stream(); + + int max_thread_num_per_block = 1024; + // context.cuda_device_context().GetMaxThreadsPerBlock(); + // X1 is global FC output + // Dim [batch size, embedding size] + const LoDTensor* x1 = context.Input("Ins"); + bool is_lod = context.Attr("is_lod"); + + int is_x1_lod = -1; + if (is_lod) + is_x1_lod = 1; + else + is_x1_lod = 0; + + int64_t out_val_if_empty = context.Attr("out_val_if_empty"); + size_t x1_embed_size = x1->dims()[1]; + // X2 is ins tag list + // LoD [[0, Sum(ins1), Sum(ins1, ins2), ... 
]] + const LoDTensor* x2 = context.Input("Ins_tag"); + // expected auto = const int64_t + const int64_t* x2_data = x2->data(); + + // X3 is local fc tag list + // LoD [[0, Sum(fc1), Sum(fc1, fc2) ...]] + const Tensor* x3 = context.Input("Filter_tag"); + const int64_t* x3_data = x3->data(); + + // int x2_lods_filled = 1; + + Vector x2_lods; + // Vector, in GPU + if (x2->lod().size() != 0) { // lod_level = 1 + x2_lods = x2->lod()[0]; + // x2_lods_filled = 1; + + } else { // lod_level = 0 + const size_t x2_lods_size = x2->dims()[0]; + // x2_lods.resize(x2->dims()[0] + 1); + // move to cuda + x2_lods.push_back(0); + for (size_t i = 0; i < x2_lods_size; i++) { + x2_lods.push_back(i + 1); + } + } + + const size_t x2_lods_size = x2_lods.size() - 1; + paddle::framework::MixVector mixv_x2_lods(&x2_lods); + + size_t* x2_lods_data = mixv_x2_lods.CUDAMutableData(gpu_place); + + // Vector, in GPU + // int x1_lods_filled = 1; + Vector x1_lods; + + if (!is_x1_lod) { + // move to cuda + // x1_lods.resize(x1->dims()[0] + 1); + x1_lods.push_back(0); + for (int i = 0; i < x1->dims()[0]; i++) { + x1_lods.push_back(i + 1); + } + } else { + // x1_lods = context.Input("Ins")->lod()[0]; + // new: lod_level=0 => lod() return {} + if (x1->lod().size() != 0) { // lod_level = 1 + // x1_lods_filled = 1; + x1_lods = x1->lod()[0]; + } else { // lod_level = 0 + // x1_lods.resize(x1->dims()[0] + 1); + // move to cuda + x1_lods.push_back(0); + for (int i = 0; i < x1->dims()[0]; i++) { + x1_lods.push_back(i + 1); + } + } + } + + paddle::framework::MixVector mixv_x1_lods(&x1_lods); + + size_t* x1_lods_data = mixv_x1_lods.CUDAMutableData(gpu_place); + auto* x1_data = x1->data(); + + // set output value + // for those whose ins been dropout, set 0 for whole lines. + // otherwise, copy whole line + // Dim [local fc count, batch size, embedding size] + LoDTensor* out = context.Output("Out"); + LoDTensor* map = context.Output("IndexMap"); + LoDTensor* loss_weight = context.Output("LossWeight"); + + int out_first = x1_lods.back(); + // int out_first = x1->dims()[0]; + // if (x1_lods_filled) { + // out_first = x1_lods.back(); + // } + + out->Resize(phi::make_ddim({(int64_t)out_first, (int64_t)x1_embed_size})); + map->Resize(phi::make_ddim({(int64_t)x2_lods_size, 3})); + loss_weight->Resize(phi::make_ddim({(int64_t)x2_lods_size, 1})); + + T* out_data = out->mutable_data(gpu_place); + int64_t* map_data = map->mutable_data(gpu_place); + float* loss_weight_data = loss_weight->mutable_data(gpu_place); + + int block_size = max_thread_num_per_block; + int ins_per_thread = (x2_lods_size + block_size - 1) / block_size; + dim3 block_dim(block_size); + dim3 grid_dim(1); + + Vector out_lods(x2_lods_size + 1, 0); + Vector map_lods(x2_lods_size + 1, 0); + + paddle::framework::MixVector mixv_out_lods(&out_lods); + paddle::framework::MixVector mixv_map_lods(&map_lods); + + // thrust::device_vector out_idx(1); + Vector out_idx(1, 0); + paddle::framework::MixVector mixv_out_idx(&out_idx); + + size_t* out_idx_data = mixv_out_idx.CUDAMutableData(gpu_place); + size_t* out_lods_data = mixv_out_lods.CUDAMutableData(gpu_place); + size_t* map_lods_data = mixv_map_lods.CUDAMutableData(gpu_place); + + float fill_value = 1.0; + + filter_copy_fuse_kernel<<>>( + x2_lods_size, ins_per_thread, x1_lods_data, x2_lods_data, x2_data, + x3_data, x3->numel(), out_data, map_data, map_lods_data, out_lods_data, + out_idx_data, x1_data, x1_embed_size, loss_weight_data, fill_value); + + platform::GpuStreamSync(current_stream); + + mixv_out_lods.resize(mixv_out_idx[0]); + + 
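// ---------------------------------------------------------------------------
// Illustrative aside (not part of this patch): the offset vectors handled here
// (x1_lods, x2_lods, out_lods) follow the LoD convention noted in the comments
// above, lod = {0, Sum(ins1), Sum(ins1, ins2), ...}, so instance i owns rows
// [lod[i], lod[i+1]) of the dense tensor. A tiny host-side illustration with
// hypothetical toy values, independent of the kernel:
#include <cstdio>
#include <vector>

int main() {
  std::vector<size_t> lod = {0, 2, 5, 6};  // 3 instances with 2, 3 and 1 rows
  for (size_t i = 0; i + 1 < lod.size(); ++i) {
    std::printf("instance %zu -> rows [%zu, %zu)\n", i, lod[i], lod[i + 1]);
  }
  return 0;
}
// ---------------------------------------------------------------------------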
if (mixv_out_lods.size() - 1 > 0) { + out->Resize(phi::make_ddim( + {(int64_t)mixv_out_lods.back(), (int64_t)x1_embed_size})); + + map->Resize(phi::make_ddim({(int64_t)mixv_out_lods.size() - 1, 3})); + loss_weight->Resize( + phi::make_ddim({(int64_t)mixv_out_lods.size() - 1, 1})); + + } else { + out->Resize(phi::make_ddim({1, (int64_t)x1_embed_size})); + map->Resize(phi::make_ddim({1, 3})); + loss_weight->Resize(phi::make_ddim({1, 1})); + } + + if (mixv_out_lods.size() - 1 > 0) { + map_lods.resize(mixv_out_lods.size()); + + mixv_map_lods.CopyToCPU(); + + std::vector> map_lod_info; + map_lod_info.emplace_back(map_lods); + + map->set_lod(map_lod_info); + loss_weight->set_lod(map_lod_info); + + mixv_out_lods.CopyToCPU(); + std::vector> out_lod_info; + out_lod_info.emplace_back(out_lods); + out->set_lod(out_lod_info); + + } else { + Vector map_lods(2, 0); + paddle::framework::MixVector mixv_map_lods(&map_lods); + thrust::device_ptr map_data_ptr(map_data); + + map_data_ptr[0] = 0; + map_data_ptr[1] = 1; + map_data_ptr[2] = 1; + + mixv_map_lods[0] = 0; + mixv_map_lods[1] = 1; + mixv_out_lods.push_back(1); + + mixv_map_lods.CopyToCPU(); + mixv_out_lods.CopyToCPU(); + + std::vector> map_lod_info; + map_lod_info.emplace_back(map_lods); + map->set_lod(map_lod_info); + + loss_weight->set_lod(map_lod_info); + + std::vector> out_lod_info; + out_lod_info.emplace_back(out_lods); + out->set_lod(out_lod_info); + + thrust::device_ptr out_data_ptr(out_data); + + // gpu kernel + if (std::is_same::value) { + thrust::fill(out_data_ptr, out_data_ptr + out->numel(), + static_cast(out_val_if_empty)); + } else if (std::is_same::value) { + thrust::fill(out_data_ptr, out_data_ptr + out->numel(), + static_cast(out_val_if_empty)); + } else if (std::is_same::value) { + thrust::fill(out_data_ptr, out_data_ptr + out->numel(), + static_cast(out_val_if_empty)); + } else { + thrust::fill(out_data_ptr, out_data_ptr + out->numel(), + static_cast(out_val_if_empty)); + } + + thrust::device_ptr loss_weight_data_ptr(loss_weight_data); + loss_weight_data_ptr[0] = 0; + } + +#endif + } +}; + +template +class FilterByInstagGradGPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { +#if defined(PADDLE_WITH_CUDA) + + auto gpu_place = context.GetPlace(); + gpuStream_t current_stream = context.cuda_device_context().stream(); + auto max_thread_num_per_block = 1024; + auto* output_grad = context.Input(framework::GradVarName("Out")); + auto* x1_grad = context.Output(framework::GradVarName("Ins")); + auto* loss_weight = context.Input("LossWeight"); + auto* mmap = context.Input("IndexMap"); + auto* x1 = context.Input("Ins"); + + x1_grad->set_lod(context.Input("Ins")->lod()); + x1_grad->Resize(x1->dims()); + + auto* mmap_data = mmap->data(); + // expected auto = T + auto* output_grad_data = output_grad->data(); + auto* loss_weight_data = loss_weight->data(); + + // expected auto = T + auto* x1_grad_data = x1_grad->mutable_data(gpu_place); + thrust::device_ptr x1_grad_data_ptr(x1_grad_data); + thrust::device_ptr loss_weight_data_ptr(loss_weight_data); + + thrust::fill(x1_grad_data_ptr, + x1_grad_data_ptr + x1->dims()[0] * x1->dims()[1], 0); + + if (loss_weight->numel() != 1 || loss_weight_data_ptr[0] != 0) { + auto output_dims = output_grad->dims(); + int x1_embed_size = output_dims[1]; + + // one thread for multi-instances + int block_size = max_thread_num_per_block; + + size_t N = mmap->dims()[0]; + dim3 block_dim(block_size); + + dim3 grid_dim((N + block_size - 1) / 
block_size); + + const int ins_per_thread = 1; + + copy_grad_kernel<<>>( + N, ins_per_thread, output_grad_data, x1_grad_data, mmap_data, + x1_embed_size); + + cudaStreamSynchronize(current_stream); + } + +#endif + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL(filter_by_instag, ops::FilterByInstagGPUKernel, + ops::FilterByInstagGPUKernel, + ops::FilterByInstagGPUKernel, + ops::FilterByInstagGPUKernel); + +REGISTER_OP_CUDA_KERNEL(filter_by_instag_grad, + ops::FilterByInstagGradGPUKernel, + ops::FilterByInstagGradGPUKernel, + ops::FilterByInstagGradGPUKernel, + ops::FilterByInstagGradGPUKernel); diff --git a/paddle/fluid/operators/filter_by_instag_op.h b/paddle/fluid/operators/filter_by_instag_op.h index deb2aa96b539e360cf2edad97b21cb6e9ddba066..3abc980ceaafc3719c13cad51c346282be2c694f 100644 --- a/paddle/fluid/operators/filter_by_instag_op.h +++ b/paddle/fluid/operators/filter_by_instag_op.h @@ -61,7 +61,20 @@ class FilterByInstagKernel : public framework::OpKernel { // expected auto = const int64_t auto* x2_data = x2->data(); // e.g get [0, 1, 2, 3, ...] - size_t x2_lods_size = x2->dims()[0]; + // size_t x2_lods_size = x2->dims()[0]; + // size_t instag_num_per_ins = x2->dims()[1]; + + Vector x2_lods(1, 0); + if (x2->lod().size() != 0) { // lod_level = 1 + x2_lods = x2->lod()[0]; + } else { // lod_level = 0 + const size_t x2_lods_size = x2->dims()[0]; + const size_t instag_num_per_ins = x2->dims()[1]; + for (size_t i = 0; i < x2_lods_size; i++) { + x2_lods.push_back(x2_lods.back() + instag_num_per_ins); + } + } + Vector x1_lods(1, 0); if (!is_x1_lod) { for (int i = 0; i < x1->dims()[0]; i++) { @@ -79,8 +92,8 @@ class FilterByInstagKernel : public framework::OpKernel { } std::unordered_map mmap_aux; Vector out_lods(1, 0); - for (size_t i = 0; i < x2_lods_size; i++) { - for (size_t j = i; j < i + 1; j++) { + for (size_t i = 0; i < x2_lods.size() - 1; i++) { + for (size_t j = x2_lods[i]; j < x2_lods[i + 1]; j++) { if (filter_tag.find(x2_data[j]) != filter_tag.end()) { size_t batch_len = x1_lods[i + 1] - x1_lods[i]; mmap_aux[out_lods.back()] = x1_lods[i]; @@ -165,8 +178,10 @@ class FilterByInstagKernel : public framework::OpKernel { out_data[oi] = (int32_t)out_val_if_empty; } else if (std::is_same::value) { out_data[oi] = (int64_t)out_val_if_empty; - } else { + } else if (std::is_same::value) { out_data[oi] = static_cast(out_val_if_empty); + } else { + out_data[oi] = static_cast(out_val_if_empty); } } loss_weight_data[0] = 0; diff --git a/paddle/fluid/operators/fold_op.cc b/paddle/fluid/operators/fold_op.cc index 40ec9aef190ff4bacd52b19a1c0b12300a35b61e..92f59e118c3b7bb66a2c5c76d66109ddf04ee076 100644 --- a/paddle/fluid/operators/fold_op.cc +++ b/paddle/fluid/operators/fold_op.cc @@ -95,6 +95,17 @@ class FoldOp : public framework::OperatorWithKernel { "but recieved strides_height: %d strides_width: %d.", strides[0], strides[1])); // check dilations + PADDLE_ENFORCE_GT(output_height, 1, + platform::errors::InvalidArgument( + "The `output_height` should be greater than one, " + "but recieved output_height: %d .", + output_height)); + PADDLE_ENFORCE_GT(output_width, 1, + platform::errors::InvalidArgument( + "The `output_width` should be greater than one, " + "but recieved output_width: %d .", + output_width)); + // check output size PADDLE_ENFORCE_GT( dilation_height, 0, platform::errors::InvalidArgument( @@ -146,7 +157,7 @@ class FoldOp : public framework::OperatorWithKernel { output_width)); PADDLE_ENFORCE_EQ( - 
blocks_height * blocks_width, in_dims[1], + blocks_height * blocks_width, in_dims[2], platform::errors::InvalidArgument( "Given input output_size (%d, %d), " "kernel_sizes (%d, %d), strides (%d, %d), dilations (%d, %d), " @@ -156,6 +167,15 @@ class FoldOp : public framework::OperatorWithKernel { strides[0], strides[1], dilations[0], dilations[1], blocks_height, blocks_width, blocks_height * blocks_width, in_dims[2])); + PADDLE_ENFORCE_EQ( + in_dims[1] % (kernel_sizes[0] * kernel_sizes[1]), 0, + platform::errors::InvalidArgument( + "Expected size of input's dimension 1 to be divisible by the" + "product of kernel_size, but got input.size(1)=%d and " + "kernel_size=( %d" + ", %d).", + in_dims[1], kernel_sizes[0], kernel_sizes[1])); + out_dims.push_back(output_height); out_dims.push_back(output_width); ctx->SetOutputDim("Y", phi::make_ddim(out_dims)); diff --git a/paddle/fluid/operators/fused/CMakeLists.txt b/paddle/fluid/operators/fused/CMakeLists.txt index 67287afa6ae5059f8af3dcdbd6910ca35db7c3c0..80e7f5c001d4b8139b538570c42fcd8d2604961b 100644 --- a/paddle/fluid/operators/fused/CMakeLists.txt +++ b/paddle/fluid/operators/fused/CMakeLists.txt @@ -19,7 +19,8 @@ register_operators(EXCLUDES fused_attention_op fused_transformer_op fused_feedforward_op - resnet_unit_op) + resnet_unit_op + fused_gemm_epilogue_op) # fusion_gru_op does not have CUDA kernel op_library(fusion_gru_op) @@ -79,4 +80,8 @@ if (WITH_GPU OR WITH_ROCM) cc_test(test_cudnn_norm_conv SRCS cudnn_norm_conv_test.cc DEPS conv_op blas im2col vol2col depthwise_conv eigen_function tensor op_registry device_context generator memory) cc_test(test_cudnn_bn_add_relu SRCS cudnn_bn_add_relu_test.cc DEPS batch_norm_op fused_bn_add_activation_op tensor op_registry device_context generator memory) endif() + + if (CUDA_VERSION GREATER_EQUAL 11.6) + op_library(fused_gemm_epilogue_op) + endif() endif() diff --git a/paddle/fluid/operators/fused/attn_bias_add.cu.h b/paddle/fluid/operators/fused/attn_bias_add.cu.h index 20801d2243fb395b250f8416f1e2f5ba6a1423a4..3a2de0c4a093514a1c40321ab7dad61011709204 100644 --- a/paddle/fluid/operators/fused/attn_bias_add.cu.h +++ b/paddle/fluid/operators/fused/attn_bias_add.cu.h @@ -89,9 +89,9 @@ __global__ void BroadcastKernelBinary( template void LaunchBiasAddFwKernel(const platform::CUDADeviceContext& ctx, int m, int n, const T* in0, const T* in1, T* out) { - int in_vec_size = std::min(platform::GetVectorizedSize(in0), - platform::GetVectorizedSize(in1)); - int out_vec_size = std::min(4, platform::GetVectorizedSize(out)); + int in_vec_size = + std::min(phi::GetVectorizedSize(in0), phi::GetVectorizedSize(in1)); + int out_vec_size = std::min(4, phi::GetVectorizedSize(out)); int vec_size = std::min(out_vec_size, in_vec_size); int numel = m * n; @@ -191,9 +191,9 @@ void SetConfigForColumnReduce(const int max_threads, const int reduce_num, int num_block = (max_threads / left_num); if (num_block > 1 && reduce_num >= REDUCE_SPLIT_BOUNDARY) { - *blocking_size = phi::kernels::details::GetLastPow2(reduce_num / num_block); + *blocking_size = phi::funcs::details::GetLastPow2(reduce_num / num_block); if (*blocking_size <= 1) { - *blocking_size = phi::kernels::details::GetLastPow2(sqrt(reduce_num)); + *blocking_size = phi::funcs::details::GetLastPow2(sqrt(reduce_num)); } else if (*blocking_size * 2 < reduce_num) { *blocking_size *= 2; } diff --git a/paddle/fluid/operators/fused/conv_fusion_op.cu b/paddle/fluid/operators/fused/conv_fusion_op.cu index 
bb5b363fe83995faf69f61b0a1a1693ff758fa37..5dbf4fb88b2a78838ce0fe95be653f68f4805416 100644 --- a/paddle/fluid/operators/fused/conv_fusion_op.cu +++ b/paddle/fluid/operators/fused/conv_fusion_op.cu @@ -17,8 +17,8 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/conv_cudnn_op_cache.h" #include "paddle/fluid/operators/conv_op.h" -#include "paddle/fluid/operators/math/padding.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" +#include "paddle/phi/kernels/funcs/padding.h" DECLARE_int64(cudnn_exhaustive_search_times); @@ -86,7 +86,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { in_data_dims, strides, ksize); int data_dim = strides.size(); // 2d or 3d - bool is_sys_pad = math::IsSymmetricPadding(paddings, data_dim); + bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim); Tensor transformed_input; std::vector padding_common(data_dim, 0); @@ -118,13 +118,13 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { T pad_value(0.0); switch (rank) { case 4: { - math::PadFunction( - ctx, input_pad, transformed_input_channel, pad_value, + phi::funcs::PadFunction( + dev_ctx, input_pad, transformed_input_channel, pad_value, &transformed_input); } break; case 5: { - math::PadFunction( - ctx, input_pad, transformed_input_channel, pad_value, + phi::funcs::PadFunction( + dev_ctx, input_pad, transformed_input_channel, pad_value, &transformed_input); } break; default: diff --git a/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc b/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc index 6119af18ce153ac2bcd5d45a69ab7b5d86a3cc10..b3ac3606eaf8ee843a2be98b7a237037afaf524f 100644 --- a/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc +++ b/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc @@ -32,7 +32,7 @@ namespace platform = paddle::platform; namespace op = paddle::operators; using Tensor = paddle::framework::Tensor; -USE_OP(batch_norm); +USE_OP_ITSELF(batch_norm); USE_CUDA_ONLY_OP(fused_bn_add_activation); USE_CUDA_ONLY_OP(fused_bn_add_activation_grad); diff --git a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc index 1864bdbb86667290474d297cc481f5d6352c8022..a80f590aa495db8090a30118ed4128843c0f8860 100644 --- a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc +++ b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc @@ -22,6 +22,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/fused/cudnn_norm_conv.cu.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace framework = paddle::framework; @@ -29,10 +30,10 @@ namespace platform = paddle::platform; namespace op = paddle::operators; using Tensor = paddle::framework::Tensor; -USE_OP(conv2d); -USE_OP(conv2d_grad); -USE_OP_DEVICE_KERNEL(conv2d, CUDNN); -USE_OP_DEVICE_KERNEL(conv2d_grad, CUDNN); +USE_OP_ITSELF(conv2d); +USE_OP_ITSELF(conv2d_grad); +PD_DECLARE_KERNEL(conv2d, GPUDNN, ALL_LAYOUT); +PD_DECLARE_KERNEL(conv2d_grad, GPUDNN, ALL_LAYOUT); template void InitRandomTensor(const std::vector &dims, @@ -404,8 +405,18 @@ TEST(CudnnNormConvFp16, K1S1) { CudnnNormConvolutionTester test( batch_size, height, width, input_channels, output_channels, kernel_size, stride); - test.CheckForward(1e-3, true); - test.CheckBackward(1e-3, true); + platform::CUDADeviceContext *ctx = static_cast( + platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0))); + + if (ctx->GetComputeCapability() <= 70) { + ASSERT_THROW(test.CheckForward(1e-3, true), + paddle::platform::EnforceNotMet); + ASSERT_THROW(test.CheckBackward(1e-3, true), + paddle::platform::EnforceNotMet); + } else { + ASSERT_NO_THROW(test.CheckForward(1e-3, true)); + ASSERT_NO_THROW(test.CheckBackward(1e-3, true)); + } } // test for fp16, kernel = 3, output_channels = input_channels @@ -420,8 +431,18 @@ TEST(CudnnNormConvFp16, K3S1) { CudnnNormConvolutionTester test( batch_size, height, width, input_channels, output_channels, kernel_size, stride); - test.CheckForward(1e-3, true); - test.CheckBackward(1e-3, true); + platform::CUDADeviceContext *ctx = static_cast( + platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0))); + + if (ctx->GetComputeCapability() <= 70) { + ASSERT_THROW(test.CheckForward(1e-3, true), + paddle::platform::EnforceNotMet); + ASSERT_THROW(test.CheckBackward(1e-3, true), + paddle::platform::EnforceNotMet); + } else { + ASSERT_NO_THROW(test.CheckForward(1e-3, true)); + ASSERT_NO_THROW(test.CheckBackward(1e-3, true)); + } } // test for fp16, kernel = 1, output_channels = input_channels * 4 @@ -436,8 +457,18 @@ TEST(CudnnNormConvFp16, K1S1O4) { CudnnNormConvolutionTester test( batch_size, height, width, input_channels, output_channels, kernel_size, stride); - test.CheckForward(1e-3, true); - test.CheckBackward(1e-3, true); + platform::CUDADeviceContext *ctx = static_cast( + platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0))); + + if (ctx->GetComputeCapability() <= 70) { + ASSERT_THROW(test.CheckForward(1e-3, true), + paddle::platform::EnforceNotMet); + ASSERT_THROW(test.CheckBackward(1e-3, true), + paddle::platform::EnforceNotMet); + } else { + ASSERT_NO_THROW(test.CheckForward(1e-3, true)); + ASSERT_NO_THROW(test.CheckBackward(1e-3, true)); + } } // test for fp16, kernel = 1, stride = 2, output_channels = input_channels * 4 diff --git a/paddle/fluid/operators/fused/fmha_ref.h b/paddle/fluid/operators/fused/fmha_ref.h index 020277675797358bf87a58ac108e6eaaddb26ccc..54e4cbdc1624921e6946210a6a192d10fcbdb7dd 100644 --- a/paddle/fluid/operators/fused/fmha_ref.h +++ b/paddle/fluid/operators/fused/fmha_ref.h @@ -15,6 +15,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/elementwise/elementwise_add_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" #include "paddle/fluid/operators/transpose_op.cu.h" +#include "paddle/phi/kernels/funcs/concat_and_split_functor.h" #include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h" namespace paddle { @@ -69,20 +70,21 @@ class FMHARef { ~FMHARef() {} void ComputeForward(const Tensor& qkv_input_tensor, + const Tensor* cache_kv_tensor, const Tensor* src_mask_tensor, - Tensor* transpose_2_out_tensor, Tensor* qk_out_tensor, + Tensor* transpose_2_out_tensor, + Tensor* cache_kv_out_tensor, Tensor* qk_out_tensor, Tensor* src_mask_out_tensor, Tensor* softmax_out_tensor, Tensor* dropout_mask_out_tensor, Tensor* dropout_out_tensor, Tensor* qktv_out_tensor, Tensor* fmha_out_tensor) { // input shape: [bs, seq_len, 3, num_head, head_dim] - // transpose with perm [2, 0, 1, 3, 4], + // transpose with perm [2, 0, 3, 1, 4], // output_shape: [3, bs, num_head, seq_len, head_dim] int ndims = 5; std::vector perm_1 = {2, 0, 3, 1, 4}; TransposeGPUKernelDriver(dev_ctx_, ndims, qkv_input_tensor, perm_1, transpose_2_out_tensor); - T* qkv_data = transpose_2_out_tensor->data(); T* qk_out_data = qk_out_tensor->data(); T* qktv_out_data = qktv_out_tensor->data(); @@ -90,11 +92,30 @@ class FMHARef { T* dropout_out_data = dropout_out_tensor->data(); T* fmha_out_data = fmha_out_tensor->data(); - int q_size = batch_size_ * seq_len_ * num_head_ * head_dim_; - int k_size = q_size; + auto out_seq_len = seq_len_; + if (cache_kv_tensor) { + // kv [2, bs, num_head, seq_len, head_dim] + auto kv_tensor = transpose_2_out_tensor->Slice(1, 3); + phi::funcs::ConcatFunctor concat; + // out [2, bs, num_head, cache_seq_len + seq_len, head_dim] + concat(dev_ctx_, {*cache_kv_tensor, kv_tensor}, 3, cache_kv_out_tensor); + out_seq_len = cache_kv_out_tensor->dims()[3]; + } + + int64_t q_size = batch_size_ * seq_len_ * num_head_ * head_dim_; T* q_ptr = qkv_data; - T* k_ptr = q_ptr + q_size; - T* v_ptr = k_ptr + k_size; + T* k_ptr = nullptr; + T* v_ptr = nullptr; + + if (cache_kv_tensor) { + int64_t k_size = cache_kv_out_tensor->numel() / 2; + k_ptr = cache_kv_out_tensor->data(); + v_ptr = k_ptr + k_size; + } else { + int64_t k_size = q_size; + k_ptr = q_ptr + q_size; + v_ptr = k_ptr + k_size; + } // q*k^t, batched_gemm CBLAS_TRANSPOSE transA = CblasNoTrans; @@ -102,7 +123,7 @@ class FMHARef { auto blas = phi::funcs::GetBlas(dev_ctx_); int gemm_batch_size = batch_size_ * num_head_; int gemm_m = seq_len_; - int gemm_n = seq_len_; + int gemm_n = out_seq_len; int gemm_k = head_dim_; T alpha = static_cast(1.0 / sqrt(head_dim_)); T beta = static_cast(0.0); @@ -133,16 +154,16 @@ class FMHARef { transB = CblasNoTrans; gemm_m = seq_len_; gemm_n = head_dim_; - gemm_k = seq_len_; + gemm_k = out_seq_len; alpha = static_cast(1.0); stride_a = gemm_m * gemm_k; stride_b = gemm_k * gemm_n; if (dropout_param_.dropout_prob_) { DropoutFwGPUKernelDriver( - dev_ctx_, dropout_param_.is_test_, - static_cast( - dropout_param_.dropout_implementation_), + static_cast(dev_ctx_), + dropout_param_.is_test_, static_cast( + dropout_param_.dropout_implementation_), dropout_param_.dropout_prob_, dropout_param_.is_upscale_in_train_, dropout_param_.is_fix_seed_, dropout_param_.seed_val_, static_cast(*softmax_out_tensor), dropout_param_.seed_, @@ -242,8 +263,9 @@ class FMHARef { // dropout bw if (dropout_param_.dropout_prob_) { DropoutGradGPUKernelDriver( - dev_ctx_, static_cast( - dropout_param_.dropout_implementation_), + static_cast(dev_ctx_), + 
static_cast( + dropout_param_.dropout_implementation_), dropout_param_.dropout_prob_, static_cast(*dropout_out_grad_tensor), dropout_mask_out_tensor, softmax_out_grad_tensor->numel(), diff --git a/paddle/fluid/operators/fused/fused_attention_op.cc b/paddle/fluid/operators/fused/fused_attention_op.cc index d141800d61c0ec0b73fe2cc3c8d00dbf1de44cf2..e473f8ff0662cfc3fd7bdc5010bfa1dc08fba85f 100644 --- a/paddle/fluid/operators/fused/fused_attention_op.cc +++ b/paddle/fluid/operators/fused/fused_attention_op.cc @@ -61,6 +61,10 @@ class FusedAttentionOp : public framework::OperatorWithKernel { OP_INOUT_CHECK(ctx->HasOutput("QKTVOut"), "Output", "QKTVOut", "FusedAttentionOp"); + if (ctx->HasInput("CacheKV")) { + OP_INOUT_CHECK(ctx->HasOutput("CacheKVOut"), "Output", "CacheKVOut", + "FusedAttentionOp"); + } if (ctx->HasInput("SrcMask")) { OP_INOUT_CHECK(ctx->HasOutput("SrcMaskOut"), "Output", "SrcMaskOut", "FusedAttentionOp"); @@ -105,12 +109,14 @@ class FusedAttentionOp : public framework::OperatorWithKernel { "input qkv_weight = [%s]", x_dim, y_dim)); - PADDLE_ENFORCE_EQ(y_dim[1] * y_dim[2], y_dim[3], - platform::errors::InvalidArgument( - "The dimensions of qkv_weight must be 4" - "(3, num_head, dim_head, dim_embed)," - "and must satisfy the limitations: " - "(num_head * dim_head == dim_embed)")); + if (ctx->Attrs().Get("ring_id") == -1) { + PADDLE_ENFORCE_EQ(y_dim[1] * y_dim[2], y_dim[3], + platform::errors::InvalidArgument( + "The dimensions of qkv_weight must be 4" + "(3, num_head, dim_head, dim_embed)," + "and must satisfy the limitations: " + "(num_head * dim_head == dim_embed)")); + } if (ctx->Attrs().Get("pre_layer_norm") == true) { ctx->SetOutputDim("LnMean", {x_dim[0] * x_dim[1]}); @@ -132,20 +138,64 @@ class FusedAttentionOp : public framework::OperatorWithKernel { // [3, batch_size, num_head, seq_len, head_size] ctx->SetOutputDim("TransposeOut2", {y_dim[0], x_dim[0], y_dim[1], x_dim[1], y_dim[2]}); - // [batch, num_head, seq_len, seq_len] - ctx->SetOutputDim("QKOut", {x_dim[0], y_dim[1], x_dim[1], x_dim[1]}); + + // cache_seq_len + seq_len if cache else seq_len + auto out_seq_len = x_dim[1]; + if (ctx->HasInput("CacheKV")) { + // [2, batch_size, num_head, cache_seq_len, head_size] + auto c_dim = ctx->GetInputDim("CacheKV"); + + PADDLE_ENFORCE_EQ( + c_dim.size(), 5, + paddle::platform::errors::InvalidArgument( + "The CacheKV must be 5 dims, but got %d", c_dim.size())); + PADDLE_ENFORCE_EQ(c_dim[0], 2, + paddle::platform::errors::InvalidArgument( + "The first dim of CacheKV must be 2, but got %d", + c_dim[0])); // 2 + PADDLE_ENFORCE_EQ(c_dim[1], x_dim[0], + paddle::platform::errors::InvalidArgument( + "The second dim of CacheKV must be equal with " + "batch size %d, but got %d", + x_dim[0], c_dim[1])); // batch_size + PADDLE_ENFORCE_EQ(c_dim[2], y_dim[1], + paddle::platform::errors::InvalidArgument( + "The third dim of CacheKV must be equal with num " + "head %d, but got %d", + y_dim[1], c_dim[2])); // num_head + PADDLE_ENFORCE_GE( + c_dim[3], 0, + paddle::platform::errors::InvalidArgument( + "The forth dim of CacheKV must be greater than 0, but got %d", + c_dim[3])); // cache_seq_len + PADDLE_ENFORCE_EQ(c_dim[4], y_dim[2], + paddle::platform::errors::InvalidArgument( + "The fifth dim of CacheKV must be equal with head " + "size %d, but got %d", + y_dim[2], c_dim[4])); // head_size + + out_seq_len += c_dim[3]; + // [3, batch_size, num_head, cache_seq_len + seq_len, head_size] + ctx->SetOutputDim("CacheKVOut", + {c_dim[0], c_dim[1], c_dim[2], out_seq_len, c_dim[4]}); + } + + // [batch, 
num_head, seq_len, out_seq_len] + ctx->SetOutputDim("QKOut", {x_dim[0], y_dim[1], x_dim[1], out_seq_len}); if (ctx->HasInput("SrcMask")) { - ctx->SetOutputDim("SrcMaskOut", {x_dim[0], y_dim[1], x_dim[1], x_dim[1]}); + ctx->SetOutputDim("SrcMaskOut", + {x_dim[0], y_dim[1], x_dim[1], out_seq_len}); } // the same as QKOut's shape. ctx->SetOutputDim("AttnDropoutOut", - {x_dim[0], y_dim[1], x_dim[1], x_dim[1]}); + {x_dim[0], y_dim[1], x_dim[1], out_seq_len}); if (ctx->Attrs().Get("attn_dropout_is_test") == false) { ctx->SetOutputDim("AttnDropoutMaskOut", - {x_dim[0], y_dim[1], x_dim[1], x_dim[1]}); + {x_dim[0], y_dim[1], x_dim[1], out_seq_len}); } - ctx->SetOutputDim("SoftmaxOut", {x_dim[0], y_dim[1], x_dim[1], x_dim[1]}); + ctx->SetOutputDim("SoftmaxOut", + {x_dim[0], y_dim[1], x_dim[1], out_seq_len}); // [batch_size, num_heads, seq_len, head_dim] ctx->SetOutputDim("QKTVOut", {x_dim[0], y_dim[1], x_dim[1], y_dim[2]}); // [batch_size, seq_len, number of heads*head size] @@ -182,6 +232,8 @@ class FusedAttentionOpMaker : public framework::OpProtoAndCheckerMaker { .AsDispensable(); AddInput("QKVW", "The qkv weight tensor."); AddInput("QKVBias", "The qkv bias tensor.").AsDispensable(); + AddInput("CacheKV", "(optional) The cached KV for generation inference.") + .AsDispensable(); AddInput("SrcMask", "(optional) The attention mask tensor in fmha.") .AsDispensable(); AddInput("OutLinearW", "The out_linear weight tensor."); @@ -217,6 +269,7 @@ class FusedAttentionOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("BiasDropoutResidualOut", "Result of residual + dropout(src + bias).") .AsIntermediate(); + AddOutput("CacheKVOut", "The udpated cache KV."); AddOutput("Y", "Result after attention."); AddAttr("pre_layer_norm", @@ -324,6 +377,10 @@ class FusedAttentionOpMaker : public framework::OpProtoAndCheckerMaker { "0.0 and 0.001, But received [%s].", ln_epsilon)); }); + AddAttr( + "ring_id", + "ring id for tensor model parallel. distributed training and inference") + .SetDefault(-1); AddComment(R"DOC( Add fused attention op whose logic is as follows: diff --git a/paddle/fluid/operators/fused/fused_attention_op.cu b/paddle/fluid/operators/fused/fused_attention_op.cu index 03f51fc5857985902c21ad12fefbdc9cdec6ef04..d26577f06fe683fb1528c61b4401b9e578c90c9f 100644 --- a/paddle/fluid/operators/fused/fused_attention_op.cu +++ b/paddle/fluid/operators/fused/fused_attention_op.cu @@ -27,11 +27,39 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/fused/fmha_ref.h" #include "paddle/fluid/operators/fused/fused_dropout_helper.h" +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" +#endif + namespace paddle { namespace operators { using Tensor = framework::Tensor; +template +static void AllReduce(framework::Tensor &tensor, // NOLINT + const int ring_id, + const platform::CUDADeviceContext &ctx) { + if (ring_id == -1) return; +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + auto dtype = + platform::ToNCCLDataType(framework::TransToProtoVarType(tensor.dtype())); + int64_t numel = tensor.numel(); + const void *sendbuff = tensor.data(); + auto place = ctx.GetPlace(); + void *recvbuff = tensor.mutable_data(place); + auto comm = platform::NCCLCommContext::Instance().Get(ring_id, place); + auto stream = ctx.stream(); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( + sendbuff, recvbuff, numel, dtype, ncclSum, comm->comm(), stream)); +#else + PADDLE_THROW(platform::errors::Unimplemented( + "PaddlePaddle should compile with NCCL or RCCL when used tensor model " + "parallel op.")); +#endif +} + template class FusedAttentionOpKernel : public framework::OpKernel { public: @@ -56,6 +84,8 @@ class FusedAttentionOpKernel : public framework::OpKernel { auto *src_mask = ctx.Input("SrcMask"); auto *transpose_out_2 = ctx.Output("TransposeOut2"); + auto *cache_kv = ctx.Input("CacheKV"); + auto *cache_kv_out = ctx.Output("CacheKVOut"); auto *qk_out = ctx.Output("QKOut"); auto *qktv_out = ctx.Output("QKTVOut"); auto *softmax_out = ctx.Output("SoftmaxOut"); @@ -86,6 +116,7 @@ class FusedAttentionOpKernel : public framework::OpKernel { auto *seed_1 = ctx.HasInput("Seed1") ? ctx.Input("Seed1") : nullptr; bool is_fix_seed_1 = ctx.Attr("attn_dropout_fix_seed"); int seed_val_1 = ctx.Attr("attn_dropout_seed"); + int ring_id = ctx.Attr("ring_id"); // final output. auto *out = ctx.Output("Y"); @@ -105,6 +136,10 @@ class FusedAttentionOpKernel : public framework::OpKernel { // get data ptr for FMHA. auto *transpose_out_2_data = transpose_out_2->mutable_data(ctx.GetPlace()); + auto *cache_kv_out_data = + (cache_kv_out == nullptr) + ? nullptr + : cache_kv_out->mutable_data(ctx.GetPlace()); auto *qk_out_data = qk_out->mutable_data(ctx.GetPlace()); auto *qktv_out_data = qktv_out->mutable_data(ctx.GetPlace()); auto *src_mask_out_data = @@ -161,9 +196,14 @@ class FusedAttentionOpKernel : public framework::OpKernel { output_size = hidden_size; // (transA, transB, compute_bias) = (false, false, false) + // NOTE(Yuang Liu): For general input size == output size, change the + // position won't have effects. For mp, the output size is mp_head * dkey + // which is actually the input size. While the input size is hidden size, + // which is actually the output size. So for out linear, switch the + // input size and output size. 
auto out_linear_compute = AttnMatMul(ctx.cuda_device_context(), false, false, bsz_seq, - output_size, input_size, false); + input_size, output_size, false); DropoutParam dropout_param2(ctx, 0); FusedDropoutLayerNormHelper fused_dropout_layernorm_helper( ctx.cuda_device_context(), bsz_seq, dim_embed, dropout_param2, @@ -186,15 +226,15 @@ class FusedAttentionOpKernel : public framework::OpKernel { qkv_bias_out); } if (qkv_bias == nullptr) { - fmha_ref_compute.ComputeForward(*qkv_out, src_mask, transpose_out_2, - qk_out, src_mask_out, softmax_out, - attn_dropout_mask_out, attn_dropout_out, - qktv_out, fmha_out); + fmha_ref_compute.ComputeForward( + *qkv_out, cache_kv, src_mask, transpose_out_2, cache_kv_out, qk_out, + src_mask_out, softmax_out, attn_dropout_mask_out, attn_dropout_out, + qktv_out, fmha_out); } else { - fmha_ref_compute.ComputeForward(*qkv_bias_out, src_mask, transpose_out_2, - qk_out, src_mask_out, softmax_out, - attn_dropout_mask_out, attn_dropout_out, - qktv_out, fmha_out); + fmha_ref_compute.ComputeForward( + *qkv_bias_out, cache_kv, src_mask, transpose_out_2, cache_kv_out, + qk_out, src_mask_out, softmax_out, attn_dropout_mask_out, + attn_dropout_out, qktv_out, fmha_out); } // fmha_out: [batch_size, seq_len, num_head, head_dim] @@ -202,6 +242,9 @@ class FusedAttentionOpKernel : public framework::OpKernel { // out_linear_out: [batch_size, seq_len, embed_dim] out_linear_compute.ComputeForward(out_linear_weight, fmha_out, nullptr, out_linear_out, nullptr); + // tensor model parallel + AllReduce(*out_linear_out, ring_id, ctx.cuda_device_context()); + if (pre_layer_norm) { // output = (residual + dropout(input + bias)) fused_dropout_layernorm_helper.ResidualDropoutBias( @@ -244,6 +287,7 @@ class FusedAttentionGradKernel : public framework::OpKernel { auto *seed_1 = ctx.HasInput("Seed1") ? ctx.Input("Seed1") : nullptr; bool is_fix_seed_1 = ctx.Attr("attn_dropout_fix_seed"); int seed_val_1 = ctx.Attr("attn_dropout_seed"); + int ring_id = ctx.Attr("ring_id"); // get inputs. 
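// ring_id == -1 (the default) disables tensor model parallelism: the
// AllReduce helper defined above returns immediately in that case. In this
// backward kernel the all-reduce is applied further below to the gradient of
// the replicated input (d_ln_out when pre_layer_norm, otherwise d_x),
// mirroring the forward all-reduce on out_linear_out.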
auto *d_y = ctx.Input(framework::GradVarName("Y")); @@ -399,9 +443,10 @@ class FusedAttentionGradKernel : public framework::OpKernel { transA = false; transB = false; bool compute_bias = false; + // (b*s, num_head * dim_head) * (num_head * dim_head, dim_embed) auto out_linear_compute = AttnMatMul(ctx.cuda_device_context(), transA, transB, bsz_seq, - output_size, input_size, compute_bias); + input_size, output_size, compute_bias); DropoutParam dropout_param2(ctx, 0); FusedDropoutLayerNormHelper fused_dropout_layernorm_helper( ctx.cuda_device_context(), bsz_seq, dim_embed, dropout_param2, @@ -475,6 +520,8 @@ class FusedAttentionGradKernel : public framework::OpKernel { qkv_compute.ComputeBackward(ln_out, qkv_weight, d_qkv_out, d_ln_out, d_qkv_weight, d_qkv_bias); } + // tensor model parallel + AllReduce(*d_ln_out, ring_id, ctx.cuda_device_context()); layer_norm_compute.ComputeBackward(x_data, d_ln_out_data, ln_scale_data, ln_mean_data, ln_var_data, d_x_data, d_ln_scale_data, d_ln_bias_data); @@ -486,6 +533,8 @@ class FusedAttentionGradKernel : public framework::OpKernel { qkv_compute.ComputeBackward(input_x, qkv_weight, d_qkv_out, d_x, d_qkv_weight, d_qkv_bias); } + // tensor model parallel + AllReduce(*d_x, ring_id, ctx.cuda_device_context()); } // gradient accumulation std::vector ins; diff --git a/paddle/fluid/operators/fused/fused_dropout_act_bias.h b/paddle/fluid/operators/fused/fused_dropout_act_bias.h index 994601a2f0608b4fc04966c7549c421f395f3ec7..9f5a1bad047b44b715e11e74d92fdca1982c96f8 100755 --- a/paddle/fluid/operators/fused/fused_dropout_act_bias.h +++ b/paddle/fluid/operators/fused/fused_dropout_act_bias.h @@ -130,17 +130,17 @@ __global__ void FusedDropoutActGrad(Functor act_grad, const T *dout, const T factor, const int64_t size, T *dx) { int64_t idx = blockDim.x * blockIdx.x + threadIdx.x; - using LoadT = platform::AlignedVector; - using StoreT = platform::AlignedVector; - using MaskLoadT = platform::AlignedVector; + using LoadT = phi::AlignedVector; + using StoreT = phi::AlignedVector; + using MaskLoadT = phi::AlignedVector; for (int i = idx * VecSize; i < size; i += blockDim.x * gridDim.x * VecSize) { LoadT dout_vec; LoadT src_vec; MaskLoadT mask_vec; - platform::Load(&dout[i], &dout_vec); - platform::Load(&mask[i], &mask_vec); - platform::Load(&src[i], &src_vec); + phi::Load(&dout[i], &dout_vec); + phi::Load(&mask[i], &mask_vec); + phi::Load(&src[i], &src_vec); StoreT dx_vec; #pragma unroll @@ -148,7 +148,7 @@ __global__ void FusedDropoutActGrad(Functor act_grad, const T *dout, T tmp = dout_vec[ii] * static_cast(mask_vec[ii]) * factor; dx_vec[ii] = tmp * act_grad.UseOut(src_vec[ii]); } - platform::Store(dx_vec, &dx[i]); + phi::Store(dx_vec, &dx[i]); } } @@ -167,9 +167,9 @@ __global__ void FusedDropoutActBiasGrad(Functor act_grad, const T *dout, T *dx, T *dbias) { int64_t col_id = blockIdx.x * blockDim.x + threadIdx.x; - using LoadT = platform::AlignedVector; - using StoreT = platform::AlignedVector; - using MaskLoadT = platform::AlignedVector; + using LoadT = phi::AlignedVector; + using StoreT = phi::AlignedVector; + using MaskLoadT = phi::AlignedVector; T tmp_sum[VecSize] = {static_cast(0)}; // calculate the dx and temporary sum if (col_id * VecSize < cols) { @@ -180,10 +180,10 @@ __global__ void FusedDropoutActBiasGrad(Functor act_grad, const T *dout, LoadT bias_vec; MaskLoadT mask_vec; - platform::Load(&dout[index], &dout_vec); - platform::Load(&src[index], &src_vec); - platform::Load(&mask[index], &mask_vec); - platform::Load(&bias[col_id * VecSize], &bias_vec); + 
phi::Load(&dout[index], &dout_vec); + phi::Load(&src[index], &src_vec); + phi::Load(&mask[index], &mask_vec); + phi::Load(&bias[col_id * VecSize], &bias_vec); StoreT dx_vec; #pragma unroll @@ -194,7 +194,7 @@ __global__ void FusedDropoutActBiasGrad(Functor act_grad, const T *dout, dx_vec[i] = val; tmp_sum[i] += val; } - platform::Store(dx_vec, &dx[index]); + phi::Store(dx_vec, &dx[index]); } } diff --git a/paddle/fluid/operators/fused/fused_dropout_act_bias_test.cu b/paddle/fluid/operators/fused/fused_dropout_act_bias_test.cu index 2381b5b7fdfb85cbaa3fd66a10c5b630bb515f15..717c1732b7b3acf8528887aae43471c0dc0716e3 100644 --- a/paddle/fluid/operators/fused/fused_dropout_act_bias_test.cu +++ b/paddle/fluid/operators/fused/fused_dropout_act_bias_test.cu @@ -20,8 +20,14 @@ limitations under the License. */ #include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/fused/fused_dropout_act_bias.h" #include "paddle/fluid/operators/fused/fused_dropout_test.h" +#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/functors.h" +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_DECLARE_KERNEL(dropout, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(dropout_grad, GPU, ALL_LAYOUT); +#endif + namespace framework = paddle::framework; namespace platform = paddle::platform; namespace details = paddle::operators::details; diff --git a/paddle/fluid/operators/fused/fused_dropout_common.h b/paddle/fluid/operators/fused/fused_dropout_common.h index f79277e4e8f0d22cedafc9f7b40b56ecd2d6a817..6bf3a7114f4ced3c7c6ecd1f1afeca60ff66528f 100644 --- a/paddle/fluid/operators/fused/fused_dropout_common.h +++ b/paddle/fluid/operators/fused/fused_dropout_common.h @@ -21,11 +21,11 @@ limitations under the License. */ #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/layer_norm_kernel.cu.h" -#include "paddle/fluid/platform/aligned_vector.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/phi/kernels/funcs/aligned_vector.h" #include "paddle/phi/kernels/funcs/functors.h" namespace paddle { diff --git a/paddle/fluid/operators/fused/fused_dropout_test.h b/paddle/fluid/operators/fused/fused_dropout_test.h index d7952df470d81566c3833e79e8cfa31a7d2dc68c..18c7187fc8e64c9fed8a86a984954b5420c1e5b5 100644 --- a/paddle/fluid/operators/fused/fused_dropout_test.h +++ b/paddle/fluid/operators/fused/fused_dropout_test.h @@ -31,7 +31,7 @@ namespace framework = paddle::framework; namespace platform = paddle::platform; namespace memory = paddle::memory; -USE_OP(dropout); +USE_OP_ITSELF(dropout); USE_OP(layer_norm); template diff --git a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc index 56c2c86e1a70d64d4f96e10bbdd353dab4b7e932..7308f30779248e64f55e10b0661d2c98d263416c 100644 --- a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc +++ b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc @@ -14,10 +14,10 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.h" #include -#include "paddle/fluid/operators/math/cpu_vec.h" -#include "paddle/fluid/operators/math/sequence2batch.h" #include "paddle/fluid/platform/cpu_info.h" #include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/cpu_vec.h" +#include "paddle/phi/kernels/funcs/sequence2batch.h" namespace paddle { namespace operators { @@ -243,12 +243,12 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel { auto& act_cell_str = ctx.Attr("cell_activation"); \ auto& act_cand_str = ctx.Attr("candidate_activation"); \ if (platform::MayIUse(platform::avx)) { \ - math::VecActivations act_functor; \ + phi::funcs::VecActivations act_functor; \ act_gate = act_functor(act_gate_str); \ act_cell = act_functor(act_cell_str); \ act_cand = act_functor(act_cand_str); \ } else { \ - math::VecActivations act_functor; \ + phi::funcs::VecActivations act_functor; \ act_gate = act_functor(act_gate_str); \ act_cell = act_functor(act_cell_str); \ act_cand = act_functor(act_cand_str); \ @@ -473,7 +473,7 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel { hidden_out->mutable_data(place); cell_out->mutable_data(place); - math::LoDTensor2BatchFunctor to_batch; + phi::funcs::LoDTensor2BatchFunctor to_batch; auto& dev_ctx = ctx.template device_context(); auto blas = phi::funcs::GetBlas(dev_ctx); @@ -591,7 +591,7 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel { #undef MOVE_ONE_BATCH #undef DEFINE_CUR - math::Batch2LoDTensorFunctor to_seq; + phi::funcs::Batch2LoDTensorFunctor to_seq; batched_h_out->set_lod(batched_lod); to_seq(dev_ctx, *batched_h_out, hidden_out); batched_c_out->set_lod(batched_lod); diff --git a/paddle/fluid/operators/fused/fused_feedforward_op.cc b/paddle/fluid/operators/fused/fused_feedforward_op.cc index 0c8eae4260441f6c873b48735a01b967b70ef4bb..f3f8f1742757783a082437638f67407700963eb1 100644 --- a/paddle/fluid/operators/fused/fused_feedforward_op.cc +++ b/paddle/fluid/operators/fused/fused_feedforward_op.cc @@ -195,6 +195,8 @@ class FusedFeedForwardOpMaker : public framework::OpProtoAndCheckerMaker { .SetDefault(false); AddAttr("dropout1_seed", "Dropout1 random seed.").SetDefault(0); AddAttr("dropout2_seed", "Dropout2 random seed.").SetDefault(0); + AddAttr("ring_id", "ring id for tensor model parallel.") + .SetDefault(-1); AddComment(R"DOC( the function of fused_feedforward operator is the same as the following pseudo code: residual = src; diff --git a/paddle/fluid/operators/fused/fused_feedforward_op.cu b/paddle/fluid/operators/fused/fused_feedforward_op.cu index 3131269955bdd17a0552836121589d8edeb4a38e..c38d9f7d4bcbd25b3111b35a918de0f4ebdabefb 100644 --- a/paddle/fluid/operators/fused/fused_feedforward_op.cu +++ b/paddle/fluid/operators/fused/fused_feedforward_op.cu @@ -21,11 +21,39 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/fused/fused_dropout_helper.h" #include "paddle/fluid/operators/layer_norm_kernel.cu.h" +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" +#endif + namespace paddle { namespace operators { using Tensor = framework::Tensor; +template +static void AllReduce(framework::Tensor& tensor, // NOLINT + const int ring_id, + const platform::CUDADeviceContext& ctx) { + if (ring_id == -1) return; +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + auto dtype = + platform::ToNCCLDataType(framework::TransToProtoVarType(tensor.dtype())); + int64_t numel = tensor.numel(); + const void* sendbuff = tensor.data(); + auto place = ctx.GetPlace(); + void* recvbuff = tensor.mutable_data(place); + auto comm = platform::NCCLCommContext::Instance().Get(ring_id, place); + auto stream = ctx.stream(); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( + sendbuff, recvbuff, numel, dtype, ncclSum, comm->comm(), stream)); +#else + PADDLE_THROW(platform::errors::Unimplemented( + "PaddlePaddle should compile with NCCL or RCCL when used tensor model " + "parallel op.")); +#endif +} + template class FusedFeedForwardKernel : public framework::OpKernel { public: @@ -56,7 +84,7 @@ class FusedFeedForwardKernel : public framework::OpKernel { framework::Tensor* dropout1_out, framework::Tensor* dropout2_out, const int bsz_seq, const int d_model, const int dim_feedforward, const std::string& act_method, const bool pre_layer_norm, - const float epsilon1, const float epsilon2, + const float epsilon1, const float epsilon2, const int ring_id, const DropoutParam& dropout_param1, const DropoutParam& dropout_param2, const platform::CUDADeviceContext& ctx) const { @@ -95,6 +123,10 @@ class FusedFeedForwardKernel : public framework::OpKernel { framework::Tensor linear2_out; linear2_out.mutable_data({bsz_seq, d_model}, place); MatMul(ctx, *dropout1_out, linear2_weight, &linear2_out); + + // tensor model parallel + AllReduce(linear2_out, ring_id, ctx); + if (!pre_layer_norm) { fused_dropout_layernorm_helper.LayernormResidualDropoutBias( ctx, linear2_out.data(), x.data(), linear2_bias_ptr, @@ -150,6 +182,7 @@ class FusedFeedForwardKernel : public framework::OpKernel { const float epsilon1 = context.Attr("ln1_epsilon"); const float epsilon2 = context.Attr("ln2_epsilon"); + const int ring_id = context.Attr("ring_id"); DropoutParam dropout_param1(context, 1); DropoutParam dropout_param2(context, 2); @@ -186,7 +219,7 @@ class FusedFeedForwardKernel : public framework::OpKernel { dropout2_mask, ln1_mean, ln1_variance, ln2_mean, ln2_variance, linear1_out, ln1_out, dropout1_out, dropout2_out, bsz_seq, d_model, dim_feedforward, act_method, pre_layer_norm, epsilon1, epsilon2, - dropout_param1, dropout_param2, context.cuda_device_context()); + ring_id, dropout_param1, dropout_param2, context.cuda_device_context()); } }; @@ -231,7 +264,7 @@ class FusedFeedForwardGradKernel : public framework::OpKernel { const int dim_feedforward, const DropoutParam& dropout_param1, const DropoutParam& dropout_param2, const std::string& act_method, const bool pre_layer_norm, const float epsilon1, const float epsilon2, - const platform::CUDADeviceContext& ctx) const { + const int ring_id, const platform::CUDADeviceContext& ctx) const { FusedDropoutLayerNormHelper pre_layernorm_helper( bsz_seq, d_model, epsilon1); FusedDropoutHelper fused_act_dropout_helper( @@ -295,13 +328,16 @@ class 
FusedFeedForwardGradKernel : public framework::OpKernel { d_ln1_out.mutable_data({bsz_seq, d_model}, place); MatMulGrad(ctx, d_linear1_out, *ln1_out, linear1_weight, &d_ln1_out, d_linear1_weight); - + // tensor model parallel + AllReduce(d_ln1_out, ring_id, ctx); pre_layernorm_helper.LayerNormGrad( ctx, d_ln1_out.data(), x.data(), ln1_gamma_ptr, ln1_mean->data(), ln1_variance->data(), d_x->data(), d_ln1_gamma_ptr, d_ln1_beta_ptr); } else { MatMulGrad(ctx, d_linear1_out, x, linear1_weight, d_x, d_linear1_weight); + // tensor model parallel + AllReduce(*d_x, ring_id, ctx); } std::vector ins(2); std::vector outs(1); @@ -376,6 +412,7 @@ class FusedFeedForwardGradKernel : public framework::OpKernel { const float epsilon1 = context.Attr("ln1_epsilon"); const float epsilon2 = context.Attr("ln2_epsilon"); + const int ring_id = context.Attr("ring_id"); const std::string act_method = context.Attr("act_method"); DropoutParam dropout_param1(context, 1); DropoutParam dropout_param2(context, 2); @@ -419,7 +456,8 @@ class FusedFeedForwardGradKernel : public framework::OpKernel { d_linear1_bias, d_linear2_weight, d_linear2_bias, d_ln1_scale, d_ln1_bias, d_ln2_scale, d_ln2_bias, bsz_seq, d_model, dim_feedforward, dropout_param1, dropout_param2, act_method, - pre_layer_norm, epsilon1, epsilon2, context.cuda_device_context()); + pre_layer_norm, epsilon1, epsilon2, ring_id, + context.cuda_device_context()); } }; } // namespace operators diff --git a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..4c4e3661e6d6edc5ea95b77cd283cc99afcca8ed --- /dev/null +++ b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc @@ -0,0 +1,353 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +Copyright (c) 2022 NVIDIA Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" + +namespace paddle { +namespace operators { +using Tensor = framework::Tensor; + +class FusedGemmEpilogueOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "FusedGemmEpilogueOp"); + OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "FusedGemmEpilogueOp"); + OP_INOUT_CHECK(ctx->HasInput("Bias"), "Output", "Bias", + "FusedGemmEpilogueOp"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", + "FusedGemmEpilogueOp"); + + auto x_dims = ctx->GetInputDim("X"); + auto y_dims = ctx->GetInputDim("Y"); + auto bias_dims = ctx->GetInputDim("Bias"); + + auto trans_x = ctx->Attrs().Get("trans_x"); + auto trans_y = ctx->Attrs().Get("trans_y"); + + PADDLE_ENFORCE_EQ( + y_dims.size(), 2, + platform::errors::InvalidArgument( + "The Input tensor Y's dimension of FusedGemmEpilogueOp " + " should be 2, but got %d.", + y_dims.size())); + + PADDLE_ENFORCE_GE( + x_dims.size(), 2, + platform::errors::InvalidArgument( + "The Input tensor X's dimension of FusedGemmEpilogueOp " + " should be >= 2, but got %d.", + x_dims.size())); + + PADDLE_ENFORCE_EQ( + bias_dims.size(), 1, + platform::errors::InvalidArgument( + "The Input tensor bias's dimension of FusedGemmEpilogueOp " + " should be == 1, but got %d.", + bias_dims.size())); + + PADDLE_ENFORCE_EQ(bias_dims[0], trans_y ? y_dims[0] : y_dims[1], + platform::errors::InvalidArgument( + "The Input tensor bias's dimension 0" + " should be == Y[-1], but got bias's shape = [%s] " + "and Y's shape = [%s]", + bias_dims, y_dims)); + + auto x_mat_dims = + phi::flatten_to_2d(x_dims, trans_x ? 1 : x_dims.size() - 1); + + int K_from_x = trans_x ? x_mat_dims[0] : x_mat_dims[1]; + int K_from_y = trans_y ? y_dims[1] : y_dims[0]; + + PADDLE_ENFORCE_EQ( + K_from_x, K_from_y, + platform::errors::InvalidArgument( + "The last dimension of X should be equal with Y's first dimension." + "But received X[-1] = [%d], Y[0] = [%d].", + K_from_x, K_from_y)); + + auto activation = ctx->Attrs().Get("activation"); + + if ((activation != "relu") && (activation != "gelu") && + (activation != "none")) { + PADDLE_ENFORCE_EQ( + true, false, + platform::errors::InvalidArgument( + "The activation attribute of fused_gemm_epilogue op should be" + " one of {\"none\", \"relu\", \"gelu\"}. But received %s." + "But received activation=%s.", + activation)); + } + + if (activation == "none" && ctx->HasOutput("ReserveSpace")) { + PADDLE_THROW(platform::errors::InvalidArgument( + "The ReserveSpace would not be used when activation = \"none\"")); + } + + // cublasLt's restriction for auxiliary. + if (ctx->HasOutput("ReserveSpace") && activation != "none") { + int min_size_of_n = activation == "relu" ? 128 : 8; + int N_size = trans_y ? 
y_dims[0] : y_dims[1]; + PADDLE_ENFORCE_EQ(N_size % min_size_of_n, 0, + platform::errors::InvalidArgument( + "The output dimension N (X(MxK) * Y(KxN) = C(MxN)) " + "should be a multiple of %d when ReserveSpace is given " + "and activation=%s, but got N = %d.", + min_size_of_n, activation, N_size)); + } + + std::vector<int64_t> out_dims; + out_dims.reserve(static_cast<size_t>(x_dims.size())); + if (trans_x) { + for (int i = 1; i < x_dims.size(); ++i) out_dims.push_back(x_dims[i]); + } else { + for (int i = 0; i < x_dims.size() - 1; ++i) out_dims.push_back(x_dims[i]); + } + + if (trans_y) { + out_dims.push_back(y_dims[0]); + } else { + out_dims.push_back(y_dims[1]); + } + + ctx->SetOutputDim("Out", phi::make_ddim(out_dims)); + // Note (Ming Huang): Reserve space of relu is a bit-mask, + // which cannot pass nan_and_inf checking if shape is set. + if (activation == "gelu" && ctx->HasOutput("ReserveSpace")) { + ctx->SetOutputDim("ReserveSpace", phi::make_ddim(out_dims)); + } + } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + framework::LibraryType library = framework::LibraryType::kPlain; + framework::DataLayout layout = framework::DataLayout::kAnyLayout; + auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); + return framework::OpKernelType(data_type, ctx.GetPlace(), layout, library); + } +}; + +class FusedGemmEpilogueOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "The input tensor X of Out = Act((X * Y) + Bias)."); + AddInput("Y", "The input tensor Y of Out = Act((X * Y) + Bias)."); + AddInput("Bias", "The input tensor bias of Out = Act((X * Y) + Bias)."); + + AddOutput("Out", "The output tensor Out of Out = Act((X * Y) + Bias)."); + AddOutput("ReserveSpace", + R"DOC(Reserve GPU space to place + auxiliary data pointer. It is used to pass auxiliary data pointer + for fused_gemm_epilogue op. If not given (empty string), the + auxiliary mode would not be enabled.)DOC") + .AsDispensable() + .AsExtra(); + + AddAttr<bool>( + "trans_x", + R"DOC((bool, default false), Whether to transpose input tensor X + or not. The input tensor X can have more than two dimensions. When + set trans_x=true, it would fully reverse X. For instance: X with shape + [d0, d1, d2, d3] -> [d3, d2, d1, d0].)DOC") + .SetDefault(false); + AddAttr<bool>( + "trans_y", + R"DOC((bool, default false), Whether to transpose input tensor Y + or not. The input tensor Y should be two-dimensional. When + set trans_y=true, it would transpose Y. For instance: Y with shape + [d0, d1] -> [d1, d0].)DOC") + .SetDefault(false); + + AddAttr<std::string>( + "activation", + R"DOC((string, default none), The activation function. It could be + one of {none, relu, gelu}. When none is given, no activation + is applied.)DOC") + .SetDefault("none"); + + AddComment(R"DOC( +FusedGemmEpilogue Operator +This operator is used to perform Activation(Elementwise_add(Matmul(X, Y), bias)). +It is equal to paddle.nn.Linear + Activation (None, ReLU or GeLU). + +Note: +X can have more than two dimensions and is flattened to 2D for computing. 
+X with shape [d0, d1, d2, d3] -> X_2D with shape [d0*d1*d2, d3] +)DOC"); + } +}; + +class FusedGemmEpilogueGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("DOut"), "Input", "DOut", + "FusedGemmEpilogueGradOp"); + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "FusedGemmEpilogueGradOp"); + OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "FusedGemmEpilogueGradOp"); + OP_INOUT_CHECK(ctx->HasOutput("DY"), "Output", "DY", "FusedGemmEpilogueOp"); + + auto dout_dims = ctx->GetInputDim("DOut"); + auto x_dims = ctx->GetInputDim("X"); + auto y_dims = ctx->GetInputDim("Y"); + + PADDLE_ENFORCE_GE( + dout_dims.size(), 2, + platform::errors::InvalidArgument( + "The Input tensor DOut's dimension of FusedGemmEpilogueGradOp " + " should be >= 2, but got %d.", + dout_dims.size())); + + PADDLE_ENFORCE_EQ( + y_dims.size(), 2, + platform::errors::InvalidArgument( + "The Input tensor Y's dimension of FusedGemmEpilogueGradOp " + " should be 2, but got %d.", + y_dims.size())); + + PADDLE_ENFORCE_GE( + x_dims.size(), 2, + platform::errors::InvalidArgument( + "The Input tensor X's dimension of FusedGemmEpilogueGradOp " + " should be >= 2, but got %d.", + x_dims.size())); + + PADDLE_ENFORCE_EQ( + dout_dims.size(), x_dims.size(), + platform::errors::InvalidArgument( + "The Input tensor DOut's and X's dimension of " + "FusedGemmEpilogueGradOp " + " should be the same, but got DOut's dim = %d and X's = %d.", + dout_dims.size(), x_dims.size())); + + auto dout_mat_dims = phi::flatten_to_2d(dout_dims, dout_dims.size() - 1); + + auto x_mat_dims = phi::flatten_to_2d(x_dims, x_dims.size() - 1); + + PADDLE_ENFORCE_EQ( + dout_mat_dims[1], y_dims[1], + platform::errors::InvalidArgument( + "The last dimension of DOut should be equal with Y's last" + "dimension. But received DOut[-1] = [%d], Y[1] = [%d].", + dout_mat_dims[1], y_dims[1])); + + PADDLE_ENFORCE_EQ( + dout_mat_dims[0], x_mat_dims[0], + platform::errors::InvalidArgument( + "The first dimension of DOut should be equal with X's first" + "dimension. But received DOut[0] = [%d], Y[0] = [%d].", + dout_mat_dims[0], x_mat_dims[0])); + + auto activation_grad = ctx->Attrs().Get("activation_grad"); + if ((activation_grad != "relu_grad") && (activation_grad != "gelu_grad") && + (activation_grad != "none")) { + PADDLE_ENFORCE_EQ( + true, false, + platform::errors::InvalidArgument( + "The activation attribute of fused_gemm_epilogue op should be" + " one of {\"none\", \"relu\", \"gelu\"}. But received %s." + "But received activation=%s.", + activation_grad)); + } + + if (activation_grad != "none" && !ctx->HasInput("ReserveSpace")) { + PADDLE_ENFORCE_EQ(true, false, + platform::errors::InvalidArgument( + "The ReserveSpace should not be empty. 
" + "when activation_grad == {relu_grad, gelu_grad}.")); + } + + if (ctx->HasOutput("DX")) { + std::vector dx_dims; + dx_dims.reserve(static_cast(x_dims.size())); + for (int i = 0; i < x_dims.size(); ++i) { + dx_dims.push_back(x_dims[i]); + } + ctx->SetOutputDim("DX", phi::make_ddim(dx_dims)); + } + + std::vector dy_dims(y_dims.Get(), y_dims.Get() + y_dims.size()); + ctx->SetOutputDim("DY", phi::make_ddim(dy_dims)); + + if (ctx->HasOutput("DBias")) { + std::vector dbias_dims; + dbias_dims.push_back(y_dims[1]); + ctx->SetOutputDim("DBias", phi::make_ddim(dbias_dims)); + } + } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + framework::LibraryType library = framework::LibraryType::kPlain; + framework::DataLayout layout = framework::DataLayout::kAnyLayout; + auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "DOut"); + return framework::OpKernelType(data_type, ctx.GetPlace(), layout, library); + } +}; + +class FusedGemmEpilogueGradOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("DOut", + "The input grad tensor to Out of Out = (Act(X) * Y) + bias"); + AddInput("X", "The input tensor X of Out = (Act(X) * Y) + bias"); + AddInput("Y", "The input tensor Y of Out = (Act(X) * Y) + bias"); + AddInput("ReserveSpace", + R"DOC(A GPU space to fetch + auxiliary data pointer. It is used to pass auxiliary data pointer + for fused_gemm_epilogue_grad op. If not given (empty string), the + auxiliary mode would not be enable.)DOC") + .AsDispensable(); + + AddOutput("DX", "The output grad tensor to X of Out = (Act(X) * Y) + bias.") + .AsDispensable(); + AddOutput("DY", + "The output grad tensor to Y of Out = (Act(X) * Y) + bias."); + AddOutput("DBias", + "The output grad tensor to bias of Out = (Act(X) * Y) + bias.") + .AsDispensable(); + + AddAttr( + "activation_grad", + R"DOC((string, default none), The backward activation function. It could be + one of {none, relu_grad, gelu_grad}. When none is given, The backward Act would + be null operations)DOC") + .SetDefault("none"); + + AddComment(R"DOC( +FusedGemmEpilogueGrad Operator +This operator is used to perform backward of Elementwise_add(Matmul(Activeation(X), Y), bias). +It is equal to Activation (None, ReLU or GeLU) + paddle.nn.Linear. + +Note: +X could be more than two dimension and would be flatten to 2D for computing. +X with shape [d0, d1, d2, d3] -> X_2D with shape [d0*d1*d2, d3] +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(fused_gemm_epilogue, ops::FusedGemmEpilogueOp, + ops::FusedGemmEpilogueOpMaker) +REGISTER_OPERATOR(fused_gemm_epilogue_grad, ops::FusedGemmEpilogueGradOp, + ops::FusedGemmEpilogueGradOpMaker) diff --git a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..e16c9e8f483ccc2cbf1d7006159cccfe906dd06b --- /dev/null +++ b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu @@ -0,0 +1,376 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +Copyright (c) 2022 NVIDIA Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/platform/dynload/cublasLt.h" +#include "paddle/fluid/platform/float16.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class FusedGemmEpilogueKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& dev_ctx = ctx.template device_context(); + + const Tensor* x = ctx.Input("X"); + const Tensor* y = ctx.Input("Y"); + const Tensor* bias = ctx.Input("Bias"); + + Tensor* out = ctx.Output("Out"); + Tensor* reserve_space = ctx.Output("ReserveSpace"); + + bool trans_x = ctx.Attr("trans_x"); + bool trans_y = ctx.Attr("trans_y"); + + std::string activation = ctx.Attr("activation"); + bool enable_auxiliary = reserve_space == nullptr ? false : true; + + out->mutable_data(ctx.GetPlace()); + auto* out_data = out->data(); + + auto x_mat_dims = + phi::flatten_to_2d(x->dims(), trans_x ? 1 : x->dims().size() - 1); + int64_t M = trans_x ? x_mat_dims[1] : x_mat_dims[0]; + int64_t K = trans_y ? y->dims()[1] : y->dims()[0]; + int64_t N = trans_y ? y->dims()[0] : y->dims()[1]; + + cudaDataType_t mat_type = CUDA_R_32F; + cudaDataType_t scale_type = CUDA_R_32F; + cublasComputeType_t compute_type = CUBLAS_COMPUTE_32F; + if (std::is_same::value) { + mat_type = CUDA_R_16F; + scale_type = CUDA_R_16F; + } + if (std::is_same::value) { + mat_type = CUDA_R_64F; + scale_type = CUDA_R_64F; + compute_type = CUBLAS_COMPUTE_64F; + } + + cublasLtMatmulDesc_t operation_desc = NULL; + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatmulDescCreate( + &operation_desc, compute_type, scale_type)); + cublasOperation_t transx = trans_x ? CUBLAS_OP_T : CUBLAS_OP_N; + cublasOperation_t transy = trans_y ? CUBLAS_OP_T : CUBLAS_OP_N; + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + operation_desc, CUBLASLT_MATMUL_DESC_TRANSB, &transx, + sizeof(transx))); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + operation_desc, CUBLASLT_MATMUL_DESC_TRANSA, &transy, + sizeof(transy))); + + cublasLtEpilogue_t epiloque_func = + get_epilogue_type_(activation, enable_auxiliary); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + operation_desc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epiloque_func, + sizeof(epiloque_func))); + const T* bias_data = bias->data(); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + operation_desc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bias_data, + sizeof(bias_data))); + + if (enable_auxiliary && activation != "none") { + size_t reserve_space_size = 0; + if (activation == "relu") { + // Count in bits. 
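// The relu epilogue's auxiliary output (CUBLASLT_EPILOGUE_RELU_AUX_BIAS) is
// a bit-mask holding one bit per element of Out, hence numel / 8 bytes; the
// gelu epilogue instead stores the pre-activation values, one T per element
// (the else branch below). As an illustrative example (assumed shape), an
// fp16 Out of [4096, 1024] needs 4096 * 1024 / 8 = 512 KiB of reserve space
// for relu but 4096 * 1024 * 2 bytes = 8 MiB for gelu.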
+ reserve_space_size = phi::product(out->dims()) / 8; + } else { + reserve_space_size = phi::product(out->dims()) * sizeof(T); + } + reserve_space->mutable_data(ctx.GetPlace(), out->type(), + reserve_space_size); + void* aux_data = reinterpret_cast(reserve_space->data()); + + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + operation_desc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER, + &aux_data, sizeof(aux_data))); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + operation_desc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_LD, &N, + sizeof(N))); + } + + cublasLtMatrixLayout_t x_desc = NULL, y_desc = NULL, out_desc = NULL; + if (trans_x) + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate( + &x_desc, mat_type, M, K, M)); + else + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate( + &x_desc, mat_type, K, M, K)); + if (trans_y) + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate( + &y_desc, mat_type, K, N, K)); + else + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate( + &y_desc, mat_type, N, K, N)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate( + &out_desc, mat_type, N, M, N)); + + cublasLtHandle_t lt_handle = dev_ctx.cublaslt_handle(); + size_t workspace_size = 4 * 1024 * 1024; + const cublasLtMatmulAlgo_t* algo = nullptr; + cudaStream_t stream = dev_ctx.stream(); + memory::allocation::AllocationPtr workspace = + memory::Alloc(dev_ctx, workspace_size); + + double alpha64 = 1.0, beta64 = 0.0; + float alpha32 = 1.0f, beta32 = 0.0f; + void *alpha = nullptr, *beta = nullptr; + if (std::is_same::value) { + alpha = &alpha64; + beta = &beta64; + } else { + alpha = &alpha32; + beta = &beta32; + } + + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatmul( + lt_handle, operation_desc, alpha, y->data(), y_desc, x->data(), + x_desc, beta, out_data, out_desc, out_data, out_desc, algo, + workspace->ptr(), workspace_size, stream)); + } + + private: + static cublasLtEpilogue_t get_epilogue_type_(const std::string& activation, + bool enable_auxiliary) { + if (activation == "relu") { + return enable_auxiliary ? CUBLASLT_EPILOGUE_RELU_AUX_BIAS + : CUBLASLT_EPILOGUE_RELU_BIAS; + } else if (activation == "gelu") { + return enable_auxiliary ? CUBLASLT_EPILOGUE_GELU_AUX_BIAS + : CUBLASLT_EPILOGUE_GELU_BIAS; + } else if (activation == "none") { + return CUBLASLT_EPILOGUE_BIAS; + } else { + PADDLE_ENFORCE_EQ( + true, false, + platform::errors::InvalidArgument( + "The activation attribute of fused_gemm_epilogue op should be" + " one of {\"none\", \"relu\", \"gelu\"}. But received %s." 
+ "But received activation=%s.", + activation)); + } + } +}; + +template +class FusedGemmEpilogueGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& dev_ctx = ctx.template device_context(); + + const Tensor* dout = ctx.Input("DOut"); + const Tensor* x = ctx.Input("X"); + const Tensor* y = ctx.Input("Y"); + const Tensor* reserve_space = ctx.Input("ReserveSpace"); + + Tensor* dx = ctx.Output("DX"); + Tensor* dy = ctx.Output("DY"); + Tensor* dbias = ctx.Output("DBias"); + + std::string activation_grad = ctx.Attr("activation_grad"); + + auto dout_mat_dims = + phi::flatten_to_2d(dout->dims(), dout->dims().size() - 1); + auto x_mat_dims = phi::flatten_to_2d(x->dims(), x->dims().size() - 1); + + int64_t M = x_mat_dims[0]; + int64_t K = y->dims()[0]; + int64_t N = y->dims()[1]; + + cudaDataType_t mat_type = CUDA_R_32F; + cudaDataType_t scale_type = CUDA_R_32F; + cublasComputeType_t compute_type = CUBLAS_COMPUTE_32F; + if (std::is_same::value) { + mat_type = CUDA_R_16F; + scale_type = CUDA_R_16F; + } + if (std::is_same::value) { + mat_type = CUDA_R_64F; + scale_type = CUDA_R_64F; + compute_type = CUBLAS_COMPUTE_64F; + } + + cublasLtHandle_t lt_handle = dev_ctx.cublaslt_handle(); + size_t workspace_size = 4 * 1024 * 1024; + const cublasLtMatmulAlgo_t* algo = nullptr; + cudaStream_t stream = dev_ctx.stream(); + + double alpha64 = 1.0, beta64 = 0.0; + float alpha32 = 1.0f, beta32 = 0.0f; + void *alpha = nullptr, *beta = nullptr; + if (std::is_same::value) { + alpha = &alpha64; + beta = &beta64; + } else { + alpha = &alpha32; + beta = &beta32; + } + + cublasOperation_t trans_dout = CUBLAS_OP_N; + cublasLtMatrixLayout_t dout_desc = NULL; + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate( + &dout_desc, mat_type, N, M, N)); + + if (dx) { + cublasLtMatmulDesc_t dx_operation_desc = NULL; + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatmulDescCreate( + &dx_operation_desc, compute_type, scale_type)); + cublasOperation_t trans_y = CUBLAS_OP_T; + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + dx_operation_desc, CUBLASLT_MATMUL_DESC_TRANSB, &trans_dout, + sizeof(trans_dout))); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + dx_operation_desc, CUBLASLT_MATMUL_DESC_TRANSA, &trans_y, + sizeof(trans_y))); + cublasLtEpilogue_t epiloque_func_for_dx = + get_epilogue_type_(activation_grad); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + dx_operation_desc, CUBLASLT_MATMUL_DESC_EPILOGUE, + &epiloque_func_for_dx, sizeof(epiloque_func_for_dx))); + + if (activation_grad != "none") { + auto* aux_data = reserve_space->data(); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + dx_operation_desc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER, + &aux_data, sizeof(aux_data))); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + dx_operation_desc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_LD, &N, + sizeof(N))); + } + + cublasLtMatrixLayout_t y_desc = NULL, dx_desc = NULL; + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate( + &y_desc, mat_type, N, K, N)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate( + &dx_desc, mat_type, K, M, K)); + + memory::allocation::AllocationPtr dx_workspace = + memory::Alloc(dev_ctx, workspace_size); + + dx->mutable_data(ctx.GetPlace()); + auto* dx_data = dx->data(); + 
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatmul( + lt_handle, dx_operation_desc, alpha, y->data(), y_desc, + dout->data(), dout_desc, beta, dx_data, dx_desc, dx_data, dx_desc, + algo, dx_workspace->ptr(), workspace_size, stream)); + } + + if (dy) { + cublasLtMatmulDesc_t dy_operation_desc = NULL; + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatmulDescCreate( + &dy_operation_desc, compute_type, scale_type)); + cublasOperation_t trans_x = CUBLAS_OP_T; + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + dy_operation_desc, CUBLASLT_MATMUL_DESC_TRANSA, &trans_dout, + sizeof(trans_dout))); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + dy_operation_desc, CUBLASLT_MATMUL_DESC_TRANSB, &trans_x, + sizeof(trans_x))); + cublasLtEpilogue_t epiloque_func_for_dy = dbias == nullptr + ? CUBLASLT_EPILOGUE_DEFAULT + : CUBLASLT_EPILOGUE_BGRADA; + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + dy_operation_desc, CUBLASLT_MATMUL_DESC_EPILOGUE, + &epiloque_func_for_dy, sizeof(epiloque_func_for_dy))); + + if (dbias) { + dbias->mutable_data(ctx.GetPlace()); + auto* dbias_data = dbias->data(); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + dy_operation_desc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, + &dbias_data, sizeof(dbias_data))); + } + + cublasLtMatrixLayout_t x_desc = NULL, dy_desc = NULL; + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate( + &x_desc, mat_type, K, M, K)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate( + &dy_desc, mat_type, N, K, N)); + + memory::allocation::AllocationPtr dy_workspace = + memory::Alloc(dev_ctx, workspace_size); + + dy->mutable_data(ctx.GetPlace()); + auto* dy_data = dy->data(); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatmul( + lt_handle, dy_operation_desc, alpha, dout->data(), dout_desc, + x->data(), x_desc, beta, dy_data, dy_desc, dy_data, dy_desc, algo, + dy_workspace->ptr(), workspace_size, stream)); + } + } + + private: + static cublasLtEpilogue_t get_epilogue_type_( + const std::string& activation_grad) { + if (activation_grad == "relu_grad") { + return CUBLASLT_EPILOGUE_DRELU; + } else if (activation_grad == "gelu_grad") { + return CUBLASLT_EPILOGUE_DGELU; + } else if (activation_grad == "none") { + return CUBLASLT_EPILOGUE_DEFAULT; + } else { + PADDLE_ENFORCE_EQ( + true, false, + platform::errors::InvalidArgument( + "The activation_grad attribute of fused_gemm_epilogue op should " + "be" + " one of {\"none\", \"relu\", \"gelu\"}. But received %s." 
+ "But received activation_grad=%s.", + activation_grad)); + } + } +}; + +} // namespace operators +} // namespace paddle + +#if CUDA_VERSION >= 11060 +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + fused_gemm_epilogue, + ops::FusedGemmEpilogueKernel, + ops::FusedGemmEpilogueKernel, + ops::FusedGemmEpilogueKernel); + +REGISTER_OP_CUDA_KERNEL( + fused_gemm_epilogue_grad, + ops::FusedGemmEpilogueGradKernel, + ops::FusedGemmEpilogueGradKernel, + ops::FusedGemmEpilogueGradKernel); +#endif diff --git a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h index ceba3accca7727b5e4f22951d87f9e91034e3403..d53a24a57e3cc1ede127f497a9be9e3b5fa1ab0b 100644 --- a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h +++ b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h @@ -42,12 +42,12 @@ __device__ void CalcLayernormY( const LayerNormScaleBiasT *bias, const T *x, T *y, const int row_id, const int col_id, const int cols, const LayerNormParamType mean_val, const LayerNormParamType invvar) { - using LoadT = platform::AlignedVector; - using StoreT = platform::AlignedVector; - using LoadU = platform::AlignedVector; + using LoadT = phi::AlignedVector; + using StoreT = phi::AlignedVector; + using LoadU = phi::AlignedVector; using LoadScaleOrBias = - platform::AlignedVector, - VecSize>; + phi::AlignedVector, + VecSize>; for (int i = col_id * VecSize; i < cols; i += blockDim.x * VecSize) { LoadScaleOrBias scale_vec; LoadScaleOrBias bias_vec; @@ -60,15 +60,15 @@ __device__ void CalcLayernormY( static_cast>(0); } // vectorize load data from global - platform::Load(&x[row_id * cols + i], &x_vec); + phi::Load(&x[row_id * cols + i], &x_vec); if (scale != nullptr) { - platform::Load, - VecSize>(&scale[i], &scale_vec); + phi::Load, VecSize>( + &scale[i], &scale_vec); } if (bias != nullptr) { - platform::Load, - VecSize>(&bias[i], &bias_vec); + phi::Load, VecSize>( + &bias[i], &bias_vec); } StoreT y_vec; @@ -78,7 +78,7 @@ __device__ void CalcLayernormY( (static_cast(x_vec[ii]) - mean_val) * invvar + static_cast(bias_vec[ii])); } - platform::Store(y_vec, &y[row_id * cols + i]); + phi::Store(y_vec, &y[row_id * cols + i]); } } @@ -190,9 +190,9 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_fwd_1024_kernel( const ScaleT *__restrict__ beta_ptr, MaskType *__restrict__ mask_out_ptr, U *__restrict__ mean_out_ptr, U *__restrict__ var_out_ptr, T *__restrict__ residual_out_ptr, T *__restrict__ y_ptr) { - using Vec = platform::AlignedVector; - using Vec_scale = platform::AlignedVector; - using MaskStoreT = platform::AlignedVector; + using Vec = phi::AlignedVector; + using Vec_scale = phi::AlignedVector; + using MaskStoreT = phi::AlignedVector; const int tidx = threadIdx.x; const int bidx = blockIdx.x; @@ -214,8 +214,8 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_fwd_1024_kernel( Vec_scale beta[LDGS]; #pragma unroll for (int it = 0, col = c; it < LDGS; it++) { - platform::Load(gamma_ptr + col * VecSize, &gamma[it]); - platform::Load(beta_ptr + col * VecSize, &beta[it]); + phi::Load(gamma_ptr + col * VecSize, &gamma[it]); + phi::Load(beta_ptr + col * VecSize, &beta[it]); col += THREADS_PER_ROW; } @@ -225,10 +225,9 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_fwd_1024_kernel( Vec residual[LDGS]; #pragma unroll for (int it = 0, col = c; it < LDGS; it++) { - platform::Load(x_ptr + row * LN_NUM_COLS + col * VecSize, - &x[it]); - platform::Load( - 
residual_ptr + row * LN_NUM_COLS + col * VecSize, &residual[it]); + phi::Load(x_ptr + row * LN_NUM_COLS + col * VecSize, &x[it]); + phi::Load(residual_ptr + row * LN_NUM_COLS + col * VecSize, + &residual[it]); col += THREADS_PER_ROW; } @@ -270,9 +269,9 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_fwd_1024_kernel( // store dropout_residual_out and mask_out #pragma unroll for (int it = 0, col = c; it < LDGS; it++) { - platform::Store( + phi::Store( x[it], residual_out_ptr + row * LN_NUM_COLS + col * VecSize); - platform::Store( + phi::Store( mask_vec[it], mask_out_ptr + row * LN_NUM_COLS + col * VecSize); col += THREADS_PER_ROW; } @@ -333,8 +332,7 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_fwd_1024_kernel( #pragma unroll for (int it = 0, col = c; it < LDGS; it++) { - platform::Store(x[it], - y_ptr + row * LN_NUM_COLS + col * VecSize); + phi::Store(x[it], y_ptr + row * LN_NUM_COLS + col * VecSize); col += THREADS_PER_ROW; } } diff --git a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu index cc14d0680d381ff2bbe73ee712e218c9c4d79185..032440d7f0478dc087e3ba38274f2a31a9a66a23 100644 --- a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu +++ b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu @@ -19,6 +19,12 @@ limitations under the License. */ #include "paddle/fluid/operators/fused/fused_dropout_test.h" #include "paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h" +#include "paddle/phi/core/kernel_registry.h" + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_DECLARE_KERNEL(dropout, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(dropout_grad, GPU, ALL_LAYOUT); +#endif /** * @brief The unit test of fused_layernorm_residual_dropout_bias diff --git a/paddle/fluid/operators/fused/fused_residual_dropout_bias.h b/paddle/fluid/operators/fused/fused_residual_dropout_bias.h index 1b135ad6098e58f457f5d21e73ac6d1a6a7c4074..1d3085a013f81ee9dca21468476df8f621bb26c2 100644 --- a/paddle/fluid/operators/fused/fused_residual_dropout_bias.h +++ b/paddle/fluid/operators/fused/fused_residual_dropout_bias.h @@ -32,9 +32,9 @@ __forceinline__ __device__ void FusedResidualDropoutBiasOneThread( const T *__restrict__ bias, T *dst, MaskType *mask, const bool is_test, typename details::MPTypeTrait::Type *mean_val, typename details::MPTypeTrait::Type *var_val, Functor act_func) { - using LoadT = platform::AlignedVector; - using StoreT = platform::AlignedVector; - using MaskStoreT = platform::AlignedVector; + using LoadT = phi::AlignedVector; + using StoreT = phi::AlignedVector; + using MaskStoreT = phi::AlignedVector; using U = typename details::MPTypeTrait::Type; LoadT src_vec; @@ -46,14 +46,13 @@ __forceinline__ __device__ void FusedResidualDropoutBiasOneThread( residual_vec[ii] = static_cast(0); } // vectorize load data from global - platform::Load(&src[row_id * cols + col_id], &src_vec); + phi::Load(&src[row_id * cols + col_id], &src_vec); if (residual) { - platform::Load(&residual[row_id * cols + col_id], - &residual_vec); + phi::Load(&residual[row_id * cols + col_id], &residual_vec); } if (bias) { - platform::Load(&bias[col_id], &bias_vec); + phi::Load(&bias[col_id], &bias_vec); } MaskStoreT mask_vec; @@ -89,9 +88,9 @@ __forceinline__ __device__ void FusedResidualDropoutBiasOneThread( } // store result to global - platform::Store(dest_vec, &dst[row_id * cols + col_id]); + phi::Store(dest_vec, &dst[row_id * 
cols + col_id]); if (!is_test) { - platform::Store(mask_vec, &mask[row_id * cols + col_id]); + phi::Store(mask_vec, &mask[row_id * cols + col_id]); } } @@ -176,21 +175,21 @@ __global__ void FusedResidualDropoutGrad(const T *dout, const MaskType *mask, T *dx) { int64_t idx = blockDim.x * blockIdx.x + threadIdx.x; - using LoadT = platform::AlignedVector; - using StoreT = platform::AlignedVector; - using MaskLoadT = platform::AlignedVector; + using LoadT = phi::AlignedVector; + using StoreT = phi::AlignedVector; + using MaskLoadT = phi::AlignedVector; for (int i = idx * VecSize; i < size; i += blockDim.x * gridDim.x * VecSize) { LoadT dout_vec; MaskLoadT mask_vec; - platform::Load(&dout[i], &dout_vec); - platform::Load(&mask[i], &mask_vec); + phi::Load(&dout[i], &dout_vec); + phi::Load(&mask[i], &mask_vec); StoreT dx_vec; #pragma unroll for (int ii = 0; ii < VecSize; ii++) { dx_vec[ii] = dout_vec[ii] * static_cast(mask_vec[ii]) * factor; } - platform::Store(dx_vec, &dx[i]); + phi::Store(dx_vec, &dx[i]); } } @@ -209,9 +208,9 @@ __global__ void FusedResidualDropoutBiasGrad(const T *dout, T *dbias) { int64_t col_id = blockIdx.x * blockDim.x + threadIdx.x; - using LoadT = platform::AlignedVector; - using StoreT = platform::AlignedVector; - using MaskLoadT = platform::AlignedVector; + using LoadT = phi::AlignedVector; + using StoreT = phi::AlignedVector; + using MaskLoadT = phi::AlignedVector; T tmp_sum[VecSize] = {static_cast(0)}; // calculate the dx and temporary sum @@ -221,8 +220,8 @@ __global__ void FusedResidualDropoutBiasGrad(const T *dout, LoadT out_vec; MaskLoadT mask_vec; StoreT dx_vec; - platform::Load(&dout[index], &out_vec); - platform::Load(&mask[index], &mask_vec); + phi::Load(&dout[index], &out_vec); + phi::Load(&mask[index], &mask_vec); #pragma unroll for (int i = 0; i < VecSize; i++) { @@ -230,7 +229,7 @@ __global__ void FusedResidualDropoutBiasGrad(const T *dout, tmp_sum[i] += out_vec[i]; } - platform::Store(dx_vec, &dx[index]); + phi::Store(dx_vec, &dx[index]); } } diff --git a/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu b/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu index 1a12e6b565f02035b3fb9673636c2344823f288e..5dff5e2225f4f3bf3a20daa02b2b4194bd8cb99e 100644 --- a/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu +++ b/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu @@ -19,6 +19,12 @@ limitations under the License. */ #include "paddle/fluid/operators/fused/fused_dropout_test.h" #include "paddle/fluid/operators/fused/fused_residual_dropout_bias.h" +#include "paddle/phi/core/kernel_registry.h" + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_DECLARE_KERNEL(dropout, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(dropout_grad, GPU, ALL_LAYOUT); +#endif namespace framework = paddle::framework; namespace platform = paddle::platform; diff --git a/paddle/fluid/operators/fused/fusion_gru_op.cc b/paddle/fluid/operators/fused/fusion_gru_op.cc index 41a69031c54b31cd7e67ce428e710b3a87081f48..3311e3b4ebc9e21d0a033e54ba162e72a80326d0 100644 --- a/paddle/fluid/operators/fused/fusion_gru_op.cc +++ b/paddle/fluid/operators/fused/fusion_gru_op.cc @@ -19,8 +19,8 @@ limitations under the License. 
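Note: the hunks above only move the vectorized helpers from platform:: to phi:: (AlignedVector, Load, Store); the load/compute/store pattern itself is unchanged. As a standalone illustration of that pattern, here is a cut-down analogue, not the phi implementation, assuming n is a multiple of VecSize and both pointers are aligned to sizeof(T) * VecSize:

#include <cstdint>

// Illustration only: a tiny stand-in for the AlignedVector pattern used above.
// VecSize elements are moved with one aligned vector access instead of
// VecSize scalar accesses.
template <typename T, int VecSize>
struct alignas(sizeof(T) * VecSize) TinyAlignedVector {
  T val[VecSize];
  __host__ __device__ T& operator[](int i) { return val[i]; }
  __host__ __device__ const T& operator[](int i) const { return val[i]; }
};

template <typename T, int VecSize>
__global__ void ScaleKernel(const T* x, T* y, int64_t n, T factor) {
  using VecT = TinyAlignedVector<T, VecSize>;
  int64_t idx =
      static_cast<int64_t>(blockIdx.x * blockDim.x + threadIdx.x) * VecSize;
  int64_t stride = static_cast<int64_t>(blockDim.x) * gridDim.x * VecSize;
  for (int64_t i = idx; i + VecSize <= n; i += stride) {
    // Vectorized load, elementwise work, vectorized store -- the same
    // three-step shape as the phi::Load / compute / phi::Store hunks above.
    VecT in = *reinterpret_cast<const VecT*>(x + i);
    VecT out;
#pragma unroll
    for (int k = 0; k < VecSize; ++k) out[k] = in[k] * factor;
    *reinterpret_cast<VecT*>(y + i) = out;
  }
}

The phi::Load and phi::Store calls in the diff play the role of the two reinterpret_cast moves here.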
*/ #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/jit/kernels.h" #include "paddle/fluid/operators/math/fc.h" -#include "paddle/fluid/operators/math/sequence2batch.h" #include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/sequence2batch.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" #endif @@ -368,7 +368,7 @@ class FusionGRUKernel : public framework::OpKernel { hidden_out->mutable_data(place); auto& dev_ctx = ctx.template device_context(); auto blas = phi::funcs::GetBlas(dev_ctx); - math::LoDTensor2BatchFunctor to_batch; + phi::funcs::LoDTensor2BatchFunctor to_batch; math::FCFunctor fc; if (M > D3) { @@ -463,7 +463,7 @@ class FusionGRUKernel : public framework::OpKernel { batched_input_data = cur_batched_data; } - math::Batch2LoDTensorFunctor to_seq; + phi::funcs::Batch2LoDTensorFunctor to_seq; batched_out->set_lod(batched_lod); to_seq(dev_ctx, *batched_out, hidden_out); } diff --git a/paddle/fluid/operators/fused/fusion_lstm_op.cc b/paddle/fluid/operators/fused/fusion_lstm_op.cc index 06d406867f07431999f11d76e907a75fcc917ff2..00be8b09d1296018f36c0299f415b7c27f0fad14 100644 --- a/paddle/fluid/operators/fused/fusion_lstm_op.cc +++ b/paddle/fluid/operators/fused/fusion_lstm_op.cc @@ -16,8 +16,8 @@ limitations under the License. */ #include #include "paddle/fluid/operators/jit/kernels.h" #include "paddle/fluid/operators/math/fc.h" -#include "paddle/fluid/operators/math/sequence2batch.h" #include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/sequence2batch.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" #endif @@ -421,7 +421,7 @@ class FuisonLSTMKernel : public framework::OpKernel { hidden_out->mutable_data(place); cell_out->mutable_data(place); - math::LoDTensor2BatchFunctor to_batch; + phi::funcs::LoDTensor2BatchFunctor to_batch; auto& dev_ctx = ctx.template device_context(); auto blas = phi::funcs::GetBlas(dev_ctx); math::FCFunctor fc; @@ -514,7 +514,7 @@ class FuisonLSTMKernel : public framework::OpKernel { batched_input_data = cur_in_data; } - math::Batch2LoDTensorFunctor to_seq; + phi::funcs::Batch2LoDTensorFunctor to_seq; batched_h_out->set_lod(batched_lod); to_seq(dev_ctx, *batched_h_out, hidden_out); batched_c_out->set_lod(batched_lod); diff --git a/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc b/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc index 88fb7349d538afd6d7bf4fa6947ac21307db66d8..1000d0488dc3ffcf6cde977be47ce77d2bc947a7 100644 --- a/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc +++ b/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc @@ -14,10 +14,10 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.h" #include -#include "paddle/fluid/operators/math/cpu_vec.h" #include "paddle/fluid/operators/math/fc.h" #include "paddle/fluid/platform/cpu_info.h" #include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/cpu_vec.h" namespace paddle { namespace operators { @@ -196,10 +196,10 @@ class FusionSeqExpandConcatFCOpKernel : public framework::OpKernel { std::function fc_act; auto& fc_act_str = ctx.Attr("fc_activation"); if (platform::MayIUse(platform::avx)) { - math::VecActivations act_functor; + phi::funcs::VecActivations act_functor; fc_act = act_functor(fc_act_str); } else { - math::VecActivations act_functor; + phi::funcs::VecActivations act_functor; fc_act = act_functor(fc_act_str); } diff --git a/paddle/fluid/operators/fused/multi_gru_op.cc b/paddle/fluid/operators/fused/multi_gru_op.cc index 84826ff3993ff7a746d34294311c9b8b429f5ea6..c2260c53b2edd09dd69d126bc5e61b995fb20467 100644 --- a/paddle/fluid/operators/fused/multi_gru_op.cc +++ b/paddle/fluid/operators/fused/multi_gru_op.cc @@ -19,8 +19,8 @@ limitations under the License. */ #include #include "paddle/fluid/operators/jit/kernels.h" #include "paddle/fluid/operators/math/fc.h" -#include "paddle/fluid/operators/math/sequence2batch.h" #include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/sequence2batch.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" #endif diff --git a/paddle/fluid/operators/gather_nd_op.cc b/paddle/fluid/operators/gather_nd_op.cc index 8da900d84f9bcedd5e4b318837fe1bb29697a6be..e5ca15a39ef51f7807246c2ee1d473a0499b6463 100644 --- a/paddle/fluid/operators/gather_nd_op.cc +++ b/paddle/fluid/operators/gather_nd_op.cc @@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/gather_nd_op.h" -#include -#include -#include -#include "paddle/phi/core/ddim.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/binary.h" +#include "paddle/phi/infermeta/ternary.h" namespace paddle { namespace operators { @@ -25,48 +25,10 @@ class GatherNdOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( - "Input(X) of GatherNdOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("Index"), true, - platform::errors::InvalidArgument( - "Input(Index) of GatherNdOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output(Out) of GatherNdOp should not be null.")); - - auto x_dims = ctx->GetInputDim("X"); - auto x_dims_size = x_dims.size(); - auto index_dims = ctx->GetInputDim("Index"); - auto index_dims_size = index_dims.size(); - - PADDLE_ENFORCE_LE( - index_dims[index_dims_size - 1], x_dims_size, - platform::errors::InvalidArgument( - "Input(Index).shape[-1] should be no greater than Input(X).rank")); - PADDLE_ENFORCE_GE(index_dims_size, 1UL, - platform::errors::InvalidArgument( - "The rank of Input(Index) should be greater than 1")); - - std::vector result_dims; - // The result dims is - // Index.shape[:-1] + X.shape[Index.shape[-1]:] - for (int i = 0; i < index_dims_size - 1; ++i) { - result_dims.emplace_back(index_dims[i]); - } - for (int i = index_dims[index_dims_size - 1]; i < x_dims_size; ++i) { - result_dims.emplace_back(x_dims[i]); - } - - ctx->SetOutputDim("Out", phi::make_ddim(result_dims)); - ctx->ShareLoD("X", /*->*/ "Out"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); + auto* x = ctx.Input("X"); const auto& x_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); return framework::OpKernelType( x_type, @@ -80,11 +42,6 @@ class GatherNdGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); - ctx->ShareLoD("X", /*-->*/ framework::GradVarName("X")); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -173,23 +130,17 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(GatherNdGradNoNeedBufferVarInferer, "X"); namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(gather_nd, GatherNdInferShapeFunctor, + PD_INFER_META(phi::GatherNdInferMeta)); + +DECLARE_INFER_SHAPE_FUNCTOR(gather_nd_grad, GatherNdGradInferShapeFunctor, + PD_INFER_META(phi::GatherNdGradInferMeta)); + REGISTER_OPERATOR(gather_nd, ops::GatherNdOp, ops::GatherNdOpMaker, ops::GatherNdGradOpMaker, - ops::GatherNdGradOpMaker); + ops::GatherNdGradOpMaker, + GatherNdInferShapeFunctor); REGISTER_OPERATOR(gather_nd_grad, ops::GatherNdGradOp, - ops::GatherNdGradNoNeedBufferVarInferer); - -REGISTER_OP_CPU_KERNEL(gather_nd, ops::GatherNdOpKernel, - ops::GatherNdOpKernel, - ops::GatherNdOpKernel, - ops::GatherNdOpKernel, - ops::GatherNdOpKernel, - ops::GatherNdOpKernel, - ops::GatherNdOpKernel); - 
-REGISTER_OP_CPU_KERNEL(gather_nd_grad, ops::GatherNdGradOpKernel, - ops::GatherNdGradOpKernel, - ops::GatherNdGradOpKernel, - ops::GatherNdGradOpKernel, - ops::GatherNdGradOpKernel); + ops::GatherNdGradNoNeedBufferVarInferer, + GatherNdGradInferShapeFunctor); diff --git a/paddle/fluid/operators/gather_nd_op.cu b/paddle/fluid/operators/gather_nd_op.cu deleted file mode 100644 index 0de2798bf750915e99c9b60ed8ccb94d7d1201ab..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/gather_nd_op.cu +++ /dev/null @@ -1,115 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/operators/gather.cu.h" -#include "paddle/fluid/operators/gather_nd_op.h" -#include "paddle/fluid/operators/scatter.cu.h" - -namespace paddle { -namespace operators { - -template -class GatherNdOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet( - "This kernel only runs on GPU device.")); - auto *x = ctx.Input("X"); - auto *index = ctx.Input("Index"); - auto *output = ctx.Output("Out"); - - output->mutable_data(ctx.GetPlace()); - if (x->numel() == 0) return; - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Index holds the wrong type, it holds [%s], but " - "desires to be [%s] or [%s].", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - if (index_type == framework::proto::VarType::INT32) { - GPUGatherNd(ctx, *x, *index, output); - } else if (index_type == framework::proto::VarType::INT64) { - GPUGatherNd(ctx, *x, *index, output); - } - } -}; - -template -class GatherNdGradOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet( - "This kernel only runs on GPU device.")); - auto *index = ctx.Input("Index"); - auto *dX = ctx.Output(framework::GradVarName("X")); - auto *dO = ctx.Input(framework::GradVarName("Out")); - - dX->mutable_data(ctx.GetPlace()); - auto dxt = framework::EigenVector::Flatten(*dX); - auto &place = *ctx.template device_context() - .eigen_device(); - dxt.device(place) = dxt.constant(static_cast(0)); - if (dO->numel() == 0) return; - - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; - - 
PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Index holds the wrong type, it holds [%s]," - "but desires to be [%s] or [%s].", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - - if (index_type == framework::proto::VarType::INT32) { - GPUScatterNdAdd(ctx, *dO, *index, dX); - } else if (index_type == framework::proto::VarType::INT64) { - GPUScatterNdAdd(ctx, *dO, *index, dX); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -using CUDA = paddle::platform::CUDADeviceContext; -REGISTER_OP_CUDA_KERNEL(gather_nd, ops::GatherNdOpCUDAKernel, - ops::GatherNdOpCUDAKernel, - ops::GatherNdOpCUDAKernel, - ops::GatherNdOpCUDAKernel, - ops::GatherNdOpCUDAKernel, - ops::GatherNdOpCUDAKernel, - ops::GatherNdOpCUDAKernel); - -REGISTER_OP_CUDA_KERNEL(gather_nd_grad, - ops::GatherNdGradOpCUDAKernel, - ops::GatherNdGradOpCUDAKernel, - ops::GatherNdGradOpCUDAKernel, - ops::GatherNdGradOpCUDAKernel, - ops::GatherNdGradOpCUDAKernel); diff --git a/paddle/fluid/operators/gather_nd_op.h b/paddle/fluid/operators/gather_nd_op.h deleted file mode 100644 index f458c0e18013b4d7a85d960e0e7df1b2d21638fe..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/gather_nd_op.h +++ /dev/null @@ -1,99 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/gather.h" -#include "paddle/fluid/operators/scatter.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class GatherNdOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_cpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet("This kernel only runs on CPU.")); - - auto *x = ctx.Input("X"); - auto *index = ctx.Input("Index"); - auto *output = ctx.Output("Out"); - - output->mutable_data(ctx.GetPlace()); - if (x->numel() == 0) return; - - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Index holds the wrong type, it holds [%s]," - "but desires to be [%s] or [%s]", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - if (index_type == framework::proto::VarType::INT32) { - CPUGatherNd(ctx.device_context(), *x, *index, output); - } else if (index_type == framework::proto::VarType::INT64) { - CPUGatherNd(ctx.device_context(), *x, *index, output); - } - } -}; - -template -class GatherNdGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_cpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet("This kernel only runs on CPU.")); - auto *index = ctx.Input("Index"); - auto *dX = ctx.Output(framework::GradVarName("X")); - auto *dO = ctx.Input(framework::GradVarName("Out")); - dX->mutable_data(ctx.GetPlace()); - auto dxt = framework::EigenVector::Flatten(*dX); - auto &place = *ctx.template device_context() - .eigen_device(); - dxt.device(place) = dxt.constant(static_cast(0)); - if (dO->numel() == 0) return; - - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Index holds the wrong type, it holds [%s]," - "but desires to be [%s] or [%s]", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - if (index_type == framework::proto::VarType::INT32) { - ScatterNdAdd(ctx, *dO, *index, dX); - } else if (index_type == framework::proto::VarType::INT64) { - ScatterNdAdd(ctx, *dO, *index, dX); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/gather_nd_op_npu.cc b/paddle/fluid/operators/gather_nd_op_npu.cc index 995ab5d0ddf0fda19a163ec31a00a14985b5dbb9..c916f44b874a08a13fb967aae1f8b6a136023b31 100644 --- a/paddle/fluid/operators/gather_nd_op_npu.cc +++ b/paddle/fluid/operators/gather_nd_op_npu.cc @@ -12,8 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/gather_nd_op.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" +#include "paddle/fluid/platform/device_context.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/gather_nd_op_xpu.cc b/paddle/fluid/operators/gather_nd_op_xpu.cc index 9f4c522bd145bedd09fd746781cef5efec15c139..d4cb799e825b640a2a4e0a464e18d63c5e5ed516 100644 --- a/paddle/fluid/operators/gather_nd_op_xpu.cc +++ b/paddle/fluid/operators/gather_nd_op_xpu.cc @@ -11,7 +11,10 @@ limitations under the License. */ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/gather_nd_op.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/device_context.h" namespace paddle { namespace operators { @@ -20,9 +23,9 @@ template class GatherNdXPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { - auto *x = ctx.Input("X"); - auto *index = ctx.Input("Index"); - auto *out = ctx.Output("Out"); + auto *x = ctx.Input("X"); + auto *index = ctx.Input("Index"); + auto *out = ctx.Output("Out"); out->template mutable_data(ctx.GetPlace()); if (x->numel() == 0) return; diff --git a/paddle/fluid/operators/gather_op.cc b/paddle/fluid/operators/gather_op.cc index cf4d7b1d670b8add6ff5a138851c6a23ee54169e..8a405cc6fc1baefe997fb5b6133a56d6a2fc0438 100644 --- a/paddle/fluid/operators/gather_op.cc +++ b/paddle/fluid/operators/gather_op.cc @@ -201,12 +201,14 @@ REGISTER_OPERATOR(gather_grad, ops::GatherGradOp, REGISTER_OP_CPU_KERNEL(gather, ops::GatherOpKernel, ops::GatherOpKernel, ops::GatherOpKernel, ops::GatherOpKernel, - ops::GatherOpKernel); + ops::GatherOpKernel, + ops::GatherOpKernel); REGISTER_OP_CPU_KERNEL(gather_grad, ops::GatherGradientOpKernel, ops::GatherGradientOpKernel, ops::GatherGradientOpKernel, ops::GatherGradientOpKernel, - ops::GatherGradientOpKernel); + ops::GatherGradientOpKernel, + ops::GatherGradientOpKernel); REGISTER_OP_VERSION(gather) .AddCheckpoint(R"ROC(upgrad gather, add a new input [Axis])ROC", paddle::framework::compatible::OpVersionDesc().NewInput( diff --git a/paddle/fluid/operators/gather_op.cu b/paddle/fluid/operators/gather_op.cu index 19568835a6e96080bb1c0af642bf9cb19c346bf9..e0db2f26d3e0534f924cc709b98689fb3f1a5cc6 100644 --- a/paddle/fluid/operators/gather_op.cu +++ b/paddle/fluid/operators/gather_op.cu @@ -14,9 +14,9 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/operators/gather.cu.h" #include "paddle/fluid/operators/gather_op.h" -#include "paddle/fluid/operators/scatter.cu.h" +#include "paddle/phi/kernels/funcs/gather.cu.h" +#include "paddle/phi/kernels/funcs/scatter.cu.h" namespace paddle { namespace operators { @@ -45,15 +45,23 @@ class GatherOpCUDAKernel : public framework::OpKernel { axis = static_cast(cpu_axis.data()[0]); } else if (axis_type == framework::proto::VarType::INT64) { axis = static_cast(cpu_axis.data()[0]); + } else if (axis_type == framework::proto::VarType::INT16) { + axis = static_cast(cpu_axis.data()[0]); } } const auto &place = ctx.GetPlace(); const auto &index_type = framework::TransToProtoVarType(index->dtype()); + const auto &dev_ctx = ctx.cuda_device_context(); if (axis != 0) { if (index_type == framework::proto::VarType::INT32) { - GatherV2CUDAFunction(x, index, axis, output, place, ctx); + phi::funcs::GatherV2CUDAFunction(x, index, axis, output, + dev_ctx); } else if (index_type == framework::proto::VarType::INT64) { - GatherV2CUDAFunction(x, index, axis, output, place, ctx); + phi::funcs::GatherV2CUDAFunction(x, index, axis, output, + dev_ctx); + } else if (index_type == framework::proto::VarType::INT16) { + phi::funcs::GatherV2CUDAFunction(x, index, axis, output, + dev_ctx); } return; } @@ -61,9 +69,11 @@ class GatherOpCUDAKernel : public framework::OpKernel { output->mutable_data(ctx.GetPlace()); if (x->numel() == 0) return; if (index_type == framework::proto::VarType::INT32) { - GPUGather(ctx.device_context(), *x, *index, output); + phi::funcs::GPUGather(dev_ctx, *x, *index, output); } else if (index_type == framework::proto::VarType::INT64) { - GPUGather(ctx.device_context(), *x, *index, output); + phi::funcs::GPUGather(dev_ctx, *x, *index, output); + } else if (index_type == framework::proto::VarType::INT16) { + phi::funcs::GPUGather(dev_ctx, *x, *index, output); } } }; @@ -93,14 +103,15 @@ class GatherGradOpCUDAKernel : public framework::OpKernel { } } + const auto &dev_ctx = ctx.cuda_device_context(); const auto &index_type = framework::TransToProtoVarType(index->dtype()); if (axis != 0) { if (index_type == framework::proto::VarType::INT32) { - GatherV2GradCUDAFunction(dO, index, axis, dX, - ctx.GetPlace(), ctx); + phi::funcs::GatherV2GradCUDAFunction(dO, index, axis, dX, + dev_ctx); } else if (index_type == framework::proto::VarType::INT64) { - GatherV2GradCUDAFunction(dO, index, axis, dX, - ctx.GetPlace(), ctx); + phi::funcs::GatherV2GradCUDAFunction(dO, index, axis, dX, + dev_ctx); } return; } @@ -112,11 +123,11 @@ class GatherGradOpCUDAKernel : public framework::OpKernel { dxt.device(place) = dxt.constant(static_cast(0)); if (dO->numel() == 0) return; if (index_type == framework::proto::VarType::INT32) { - GPUScatterAssign(ctx, *dO, *index, dX, - ctx.Attr("overwrite")); + phi::funcs::GPUScatterAssign(dev_ctx, *dO, *index, dX, + ctx.Attr("overwrite")); } else if (index_type == framework::proto::VarType::INT64) { - GPUScatterAssign(ctx, *dO, *index, dX, - ctx.Attr("overwrite")); + phi::funcs::GPUScatterAssign(dev_ctx, *dO, *index, dX, + ctx.Attr("overwrite")); } } }; @@ -130,9 +141,12 @@ REGISTER_OP_CUDA_KERNEL(gather, ops::GatherOpCUDAKernel, ops::GatherOpCUDAKernel, ops::GatherOpCUDAKernel, ops::GatherOpCUDAKernel, - ops::GatherOpCUDAKernel); + ops::GatherOpCUDAKernel, + ops::GatherOpCUDAKernel, + ops::GatherOpCUDAKernel); REGISTER_OP_CUDA_KERNEL(gather_grad, ops::GatherGradOpCUDAKernel, 
ops::GatherGradOpCUDAKernel, ops::GatherGradOpCUDAKernel, ops::GatherGradOpCUDAKernel, - ops::GatherGradOpCUDAKernel); + ops::GatherGradOpCUDAKernel, + ops::GatherGradOpCUDAKernel); diff --git a/paddle/fluid/operators/gather_op.h b/paddle/fluid/operators/gather_op.h index 016c2b398daaad92ec60e37606345e0c6c4e13f5..94de694b2f9bc484cdb60298b60d5a9433dac181 100644 --- a/paddle/fluid/operators/gather_op.h +++ b/paddle/fluid/operators/gather_op.h @@ -16,8 +16,8 @@ limitations under the License. */ #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/gather.h" -#include "paddle/fluid/operators/scatter.h" +#include "paddle/phi/kernels/funcs/gather.h" +#include "paddle/phi/kernels/funcs/scatter.h" namespace paddle { namespace operators { @@ -40,31 +40,32 @@ class GatherOpKernel : public framework::OpKernel { // get axis from tensor if (ctx.HasInput("Axis")) { const Tensor *axis_tensor = ctx.Input("Axis"); - const auto &axis_type = - framework::TransToProtoVarType(axis_tensor->dtype()); - if (axis_type == framework::proto::VarType::INT32) { + const auto &axis_type = axis_tensor->dtype(); + if (axis_type == phi::DataType::INT32) { axis = static_cast(axis_tensor->data()[0]); - } else if (axis_type == framework::proto::VarType::INT64) { + } else if (axis_type == phi::DataType::INT64) { axis = static_cast(axis_tensor->data()[0]); } } - const auto &place = ctx.GetPlace(); - const auto &index_type = framework::TransToProtoVarType(index->dtype()); + const auto &index_type = index->dtype(); + auto &dev_ctx = ctx.template device_context(); if (axis != 0) { - if (index_type == framework::proto::VarType::INT32) { - GatherV2Function(x, index, axis, output, place); - } else if (index_type == framework::proto::VarType::INT64) { - GatherV2Function(x, index, axis, output, place); + if (index_type == phi::DataType::INT32) { + phi::funcs::GatherV2Function(dev_ctx, x, index, axis, + output); + } else if (index_type == phi::DataType::INT64) { + phi::funcs::GatherV2Function(dev_ctx, x, index, axis, + output); } return; } output->mutable_data(ctx.GetPlace()); if (x->numel() == 0) return; - if (index_type == framework::proto::VarType::INT32) { - CPUGather(ctx.device_context(), *x, *index, output); - } else if (index_type == framework::proto::VarType::INT64) { - CPUGather(ctx.device_context(), *x, *index, output); + if (index_type == phi::DataType::INT32) { + phi::funcs::CPUGather(dev_ctx, *x, *index, output); + } else if (index_type == phi::DataType::INT64) { + phi::funcs::CPUGather(dev_ctx, *x, *index, output); } } }; @@ -84,44 +85,45 @@ class GatherGradientOpKernel : public framework::OpKernel { int axis = ctx.Attr("axis"); if (ctx.HasInput("Axis")) { const Tensor *axis_tensor = ctx.Input("Axis"); - const auto &axis_type = - framework::TransToProtoVarType(axis_tensor->dtype()); - if (axis_type == framework::proto::VarType::INT32) { + const auto &axis_type = axis_tensor->dtype(); + if (axis_type == phi::DataType::INT32) { axis = static_cast(axis_tensor->data()[0]); - } else if (axis_type == framework::proto::VarType::INT64) { + } else if (axis_type == phi::DataType::INT64) { axis = static_cast(axis_tensor->data()[0]); } } - const auto &index_type = framework::TransToProtoVarType(index->dtype()); + const auto &index_type = index->dtype(); + auto &dev_ctx = ctx.template device_context(); if (axis != 0) { - if (index_type == framework::proto::VarType::INT32) { - GatherV2GradFunction(dO, index, axis, dX, 
ctx.GetPlace()); - } else if (index_type == framework::proto::VarType::INT64) { - GatherV2GradFunction(dO, index, axis, dX, ctx.GetPlace()); + if (index_type == phi::DataType::INT32) { + phi::funcs::GatherV2GradFunction(dev_ctx, dO, index, axis, + dX); + } else if (index_type == phi::DataType::INT64) { + phi::funcs::GatherV2GradFunction(dev_ctx, dO, index, axis, + dX); } return; } dX->mutable_data(ctx.GetPlace()); auto dxt = framework::EigenVector::Flatten(*dX); - auto &place = *ctx.template device_context() - .eigen_device(); + auto &place = *dev_ctx.eigen_device(); dxt.device(place) = dxt.constant(static_cast(0)); if (dO->numel() == 0) return; bool overwrite = ctx.Attr("overwrite"); - if (index_type == framework::proto::VarType::INT32) { + if (index_type == phi::DataType::INT32) { if (overwrite) { - ScatterAssign(ctx.device_context(), *dO, *index, dX); + phi::funcs::ScatterAssign(dev_ctx, *dO, *index, dX); } else { - ScatterAssignAdd(ctx, *dO, *index, dX); + phi::funcs::ScatterAssignAdd(dev_ctx, *dO, *index, dX); } - } else if (index_type == framework::proto::VarType::INT64) { + } else if (index_type == phi::DataType::INT64) { if (overwrite) { - ScatterAssign(ctx.device_context(), *dO, *index, dX); + phi::funcs::ScatterAssign(dev_ctx, *dO, *index, dX); } else { - ScatterAssignAdd(ctx, *dO, *index, dX); + phi::funcs::ScatterAssignAdd(dev_ctx, *dO, *index, dX); } } } diff --git a/paddle/fluid/operators/gather_test.cc b/paddle/fluid/operators/gather_test.cc index 0f3dcdadcf897dc05d131225cdffe11f84043c14..c962dd065234f37fe98481c9866f7d2f405db69c 100644 --- a/paddle/fluid/operators/gather_test.cc +++ b/paddle/fluid/operators/gather_test.cc @@ -15,8 +15,8 @@ limitations under the License. */ #include #include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/gather.h" #include "paddle/fluid/platform/place.h" +#include "paddle/phi/kernels/funcs/gather.h" TEST(Gather, GatherData) { paddle::framework::Tensor* src = new paddle::framework::Tensor(); @@ -39,7 +39,7 @@ TEST(Gather, GatherData) { auto* cpu_place = new paddle::platform::CPUPlace(); paddle::platform::CPUDeviceContext ctx(*cpu_place); - paddle::operators::CPUGather(ctx, *src, *index, output); + phi::funcs::CPUGather(ctx, *src, *index, output); delete cpu_place; cpu_place = NULL; for (int i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], i + 4); diff --git a/paddle/fluid/operators/gather_tree_op.cc b/paddle/fluid/operators/gather_tree_op.cc index 830134e57e0e72c5470ac79714015a94df9888bf..c84e94f5c71277c4fe8f25b73b266169f0d0877a 100644 --- a/paddle/fluid/operators/gather_tree_op.cc +++ b/paddle/fluid/operators/gather_tree_op.cc @@ -12,7 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
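Note: the gather/scatter pair that GatherOpKernel and GatherGradientOpKernel now obtain from phi::funcs has the axis-0 semantics sketched below. This is a schematic host-only sketch, not the phi::funcs implementation, and it assumes d_x is pre-sized and zero-initialized, as the grad kernel above does via dxt.constant(0):

#include <cstddef>
#include <vector>

// Axis-0 gather: out[i] = x[index[i]], mirroring the CPUGather call above.
template <typename T, typename IndexT>
void GatherRows(const std::vector<std::vector<T>>& x,
                const std::vector<IndexT>& index,
                std::vector<std::vector<T>>* out) {
  out->clear();
  for (IndexT row : index) out->push_back(x[row]);
}

// Gradient: scatter d_out rows back into d_x. With overwrite=true the last
// write wins (ScatterAssign); otherwise duplicate indices accumulate
// (ScatterAssignAdd), matching the branch in GatherGradientOpKernel above.
template <typename T, typename IndexT>
void GatherRowsGrad(const std::vector<std::vector<T>>& d_out,
                    const std::vector<IndexT>& index, bool overwrite,
                    std::vector<std::vector<T>>* d_x) {
  for (std::size_t i = 0; i < index.size(); ++i) {
    auto& dst = (*d_x)[index[i]];
    for (std::size_t j = 0; j < dst.size(); ++j) {
      if (overwrite) {
        dst[j] = d_out[i][j];
      } else {
        dst[j] += d_out[i][j];
      }
    }
  }
}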
*/ -#include "paddle/fluid/operators/gather_tree_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -21,20 +24,6 @@ class GatherTreeOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Ids"), "Input", "Ids", "GatherTree"); - OP_INOUT_CHECK(ctx->HasInput("Parents"), "Input", "Parents", "GatherTree"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "GatherTree"); - - auto ids_dims = ctx->GetInputDim("Ids"); - auto parents_dims = ctx->GetInputDim("Parents"); - PADDLE_ENFORCE_EQ(ids_dims == parents_dims, true, - platform::errors::InvalidArgument( - "The shape of Input(Parents) must be same with the " - "shape of Input(Ids).")); - ctx->SetOutputDim("Out", ids_dims); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -72,6 +61,8 @@ selected ids. } // namespace paddle namespace ops = paddle::operators; -REGISTER_OPERATOR(gather_tree, ops::GatherTreeOp, ops::GatherTreeOpMaker); -REGISTER_OP_CPU_KERNEL(gather_tree, ops::GatherTreeOpKernel, - ops::GatherTreeOpKernel); +DECLARE_INFER_SHAPE_FUNCTOR(gather_tree, GatherTreeInferShapeFunctor, + PD_INFER_META(phi::GatherTreeMeta)); + +REGISTER_OPERATOR(gather_tree, ops::GatherTreeOp, ops::GatherTreeOpMaker, + GatherTreeInferShapeFunctor); diff --git a/paddle/fluid/operators/gather_tree_op.cu b/paddle/fluid/operators/gather_tree_op.cu deleted file mode 100644 index 829682764a674db93728413b07133a41e72246b4..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/gather_tree_op.cu +++ /dev/null @@ -1,84 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/gather_tree_op.h" - -namespace paddle { -namespace operators { - -template -__global__ void GatherTree(const T *ids_data, const T *parents_data, - T *out_data, const int64_t max_length, - const int64_t batch_size, const int64_t beam_size) { - CUDA_KERNEL_LOOP(i, batch_size * beam_size) { - int batch = i / beam_size; - int beam = i % beam_size; - auto idx = - (max_length - 1) * batch_size * beam_size + batch * beam_size + beam; - out_data[idx] = ids_data[idx]; - auto parent = parents_data[idx]; - for (int step = max_length - 2; step >= 0; step--) { - idx = step * batch_size * beam_size + batch * beam_size; - out_data[idx + beam] = ids_data[idx + parent]; - parent = parents_data[idx + parent]; - } - } -} - -template -class GatherTreeOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *ids = ctx.Input("Ids"); - auto *parents = ctx.Input("Parents"); - auto *out = ctx.Output("Out"); - - const auto *ids_data = ids->data(); - const auto *parents_data = parents->data(); - auto *out_data = out->mutable_data(ctx.GetPlace()); - - PADDLE_ENFORCE_NOT_NULL( - ids_data, platform::errors::InvalidArgument( - "Input(Ids) of gather_tree should not be null.")); - - PADDLE_ENFORCE_NOT_NULL( - parents_data, platform::errors::InvalidArgument( - "Input(Parents) of gather_tree should not be null.")); - - auto &ids_dims = ids->dims(); - int64_t max_length = ids_dims[0]; - int64_t batch_size = ids_dims[1]; - int64_t beam_size = ids_dims[2]; - - auto &dev_ctx = ctx.cuda_device_context(); - - const int block = 512; - int max_threads = - std::min(static_cast(dev_ctx.GetMaxPhysicalThreadCount()), - batch_size * beam_size); - const int grid = std::max(max_threads / block, 1); - GatherTree<<>>(ids_data, parents_data, out_data, max_length, - batch_size, beam_size); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL(gather_tree, ops::GatherTreeOpCUDAKernel, - ops::GatherTreeOpCUDAKernel); diff --git a/paddle/fluid/operators/gather_tree_op.h b/paddle/fluid/operators/gather_tree_op.h deleted file mode 100644 index e035a30e7954feaf06f197211b2a2ca266cfd473..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/gather_tree_op.h +++ /dev/null @@ -1,66 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class GatherTreeOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *ids = ctx.Input("Ids"); - auto *parents = ctx.Input("Parents"); - auto *out = ctx.Output("Out"); - - const auto *ids_data = ids->data(); - const auto *parents_data = parents->data(); - auto *out_data = out->mutable_data(ctx.GetPlace()); - - auto &ids_dims = ids->dims(); - auto max_length = ids_dims[0]; - auto batch_size = ids_dims[1]; - auto beam_size = ids_dims[2]; - - PADDLE_ENFORCE_NOT_NULL( - ids_data, platform::errors::InvalidArgument( - "Input(Ids) of gather_tree should not be null.")); - - PADDLE_ENFORCE_NOT_NULL( - parents_data, platform::errors::InvalidArgument( - "Input(Parents) of gather_tree should not be null.")); - - for (int batch = 0; batch < batch_size; batch++) { - for (int beam = 0; beam < beam_size; beam++) { - auto idx = (max_length - 1) * batch_size * beam_size + - batch * beam_size + beam; - out_data[idx] = ids_data[idx]; - auto parent = parents_data[idx]; - for (int step = max_length - 2; step >= 0; step--) { - idx = step * batch_size * beam_size + batch * beam_size; - out_data[idx + beam] = ids_data[idx + parent]; - parent = parents_data[idx + parent]; - } - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/gaussian_random_op.cc b/paddle/fluid/operators/gaussian_random_op.cc index 774ff0bd065995916562061784f5218336a9da93..66eecc13d04d1aa7d4532b69f7a2fbe8c62b8e6f 100644 --- a/paddle/fluid/operators/gaussian_random_op.cc +++ b/paddle/fluid/operators/gaussian_random_op.cc @@ -15,38 +15,19 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/fill_constant_op.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" #endif +#include "paddle/phi/infermeta/nullary.h" namespace paddle { namespace operators { using Tensor = framework::Tensor; -template -class CPUGaussianRandomKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - float mean = context.Attr("mean"); - float std = context.Attr("std"); - auto* tensor = context.Output("Out"); - - std::normal_distribution dist(mean, std); - auto shape = GetShape(context); - tensor->Resize(shape); - int64_t size = tensor->numel(); - T* data = tensor->mutable_data(context.GetPlace()); - unsigned int seed = static_cast(context.Attr("seed")); - auto engine = framework::GetCPURandomEngine(seed); - - for (int64_t i = 0; i < size; ++i) { - data[i] = dist(*engine); - } - } -}; // namespace operators template class CPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel { @@ -75,38 +56,6 @@ class GaussianRandomOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "GaussianRandom"); - - auto shape = ctx->Attrs().Get>("shape"); - std::vector temp; - temp.reserve(shape.size()); - for (auto dim : shape) { - temp.push_back(static_cast(dim)); - } - if (shape.empty() && ctx->HasInput("ShapeTensor")) { - auto shape_dims = ctx->GetInputDim("ShapeTensor"); - int num_ele = 1; - for (int i = 0; i < shape_dims.size(); ++i) { - num_ele *= shape_dims[i]; - } - auto vec_dims = std::vector(num_ele, -1); - ctx->SetOutputDim("Out", phi::make_ddim(vec_dims)); - - return; - } - if (!ctx->HasInput("ShapeTensor") && !ctx->HasInputs("ShapeTensorList")) { - PADDLE_ENFORCE_GT( - shape.size(), 0UL, - platform::errors::InvalidArgument( - "Attribute(shape) of GaussianRandomOp must be set " - "and shape.size() > 0, but reveived shape.size() is %d", - shape.size())); - } - - ctx->SetOutputDim("Out", phi::make_ddim(temp)); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -192,13 +141,20 @@ Used to initialize tensors with gaussian random generator. 
} // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_WITHOUT_GRADIENT(gaussian_random, ops::GaussianRandomOp, - ops::GaussianRandomOpMaker); -REGISTER_OP_CPU_KERNEL(gaussian_random, ops::CPUGaussianRandomKernel, - ops::CPUGaussianRandomKernel); + +DECLARE_INFER_SHAPE_FUNCTOR(gaussian_random, GaussianRandomInferShapeFunctor, + PD_INFER_META(phi::GaussianRandomInferMeta)); + +REGISTER_OPERATOR( + gaussian_random, ops::GaussianRandomOp, ops::GaussianRandomOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker, + GaussianRandomInferShapeFunctor); + REGISTER_OP_CPU_KERNEL(gaussian_random_batch_size_like, ops::CPUGaussianRandomBatchSizeLikeKernel, ops::CPUGaussianRandomBatchSizeLikeKernel); + REGISTER_OP_VERSION(gaussian_random) .AddCheckpoint( R"ROC( diff --git a/paddle/fluid/operators/gaussian_random_op.cu b/paddle/fluid/operators/gaussian_random_op.cu index 21d827c79200c4a368ce7677b01b18ee4ddedb8d..00ce10bfe3bccb404bce9f681ee3c7030e0fa4c4 100644 --- a/paddle/fluid/operators/gaussian_random_op.cu +++ b/paddle/fluid/operators/gaussian_random_op.cu @@ -19,9 +19,10 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/operators/amp/fp16_type_traits.h" -#include "paddle/fluid/operators/distribution_helper.h" #include "paddle/fluid/operators/fill_constant_op.h" -#include "paddle/fluid/operators/index_impl.cu.h" + +#include "paddle/phi/kernels/funcs/distribution_helper.h" +#include "paddle/phi/kernels/funcs/index_impl.cu.h" DECLARE_bool(use_curand); @@ -44,7 +45,8 @@ struct GaussianGenerator { thrust::minstd_rand rng; rng.seed(seed_); using MT = typename details::MPTypeTrait::Type; - thrust::normal_distribution dist(mean_, std_); + thrust::normal_distribution dist(static_cast(mean_), + static_cast(std_)); unsigned int new_n = n + offset_; rng.discard(new_n); MT out = dist(rng); @@ -52,53 +54,6 @@ struct GaussianGenerator { } }; -template -class GPUGaussianRandomKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* tensor = context.Output("Out"); - unsigned int seed = static_cast(context.Attr("seed")); - bool seed_flag = false; - if (seed == 0) { - std::random_device rd; - seed = rd(); - seed_flag = true; - } - T mean = static_cast(context.Attr("mean")); - T std = static_cast(context.Attr("std")); - auto shape = GetShape(context); - tensor->Resize(shape); - - auto& dev_cxt = - context.template device_context(); - T* data = tensor->mutable_data(dev_cxt.GetPlace()); - - int64_t size = tensor->numel(); - - int device_id = context.GetPlace().GetDeviceId(); - auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); - - if (gen_cuda->GetIsInitPy() && seed_flag) { - if (FLAGS_use_curand) { - using MT = typename details::MPTypeTrait::Type; - distribution::normal_distribution dist; - distribution::normal_transform trans(mean, std); - distribution::distribution_and_transform(dev_cxt, tensor, dist, - trans); - } else { - auto seed_offset = gen_cuda->IncrementOffset(1); - int64_t gen_offset = size * seed_offset.second; - auto func = - GaussianGenerator(mean, std, seed_offset.first, gen_offset); - IndexKernel>(dev_cxt, tensor, func); - } - } else { - auto func = GaussianGenerator(mean, std, seed); - IndexKernel>(dev_cxt, tensor, func); - } - } -}; - template class GPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel { public: @@ -126,21 +81,16 @@ class 
GPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel { int64_t gen_offset = size * seed_offset.second; auto func = GaussianGenerator(mean, std, seed_offset.first, seed_offset.second); - IndexKernel>(dev_cxt, tensor, func); + phi::IndexKernel>(dev_cxt, tensor, func); } else { auto func = GaussianGenerator(mean, std, seed); - IndexKernel>(dev_cxt, tensor, func); + phi::IndexKernel>(dev_cxt, tensor, func); } } }; } // namespace operators } // namespace paddle -REGISTER_OP_CUDA_KERNEL( - gaussian_random, - paddle::operators::GPUGaussianRandomKernel, - paddle::operators::GPUGaussianRandomKernel, - paddle::operators::GPUGaussianRandomKernel); REGISTER_OP_CUDA_KERNEL( gaussian_random_batch_size_like, paddle::operators::GPUGaussianRandomBatchSizeLikeKernel< diff --git a/paddle/fluid/operators/gelu_op.cu b/paddle/fluid/operators/gelu_op.cu index 6b778eee4345170a0288bc5741c6c1078615022f..ef836ab72f001a540e081d7e9975ca5ee28758be 100644 --- a/paddle/fluid/operators/gelu_op.cu +++ b/paddle/fluid/operators/gelu_op.cu @@ -58,7 +58,7 @@ static __global__ void FP16FastGeluFwdCUDAKernel(const __half* x, __half* y, static_cast(threadIdx.x + blockIdx.x * blockDim.x) * VecSize; size_t stride = static_cast(blockDim.x * gridDim.x) * VecSize; for (; offset < n; offset += stride) { - using ArrT = platform::AlignedVector<__half, VecSize>; + using ArrT = phi::AlignedVector<__half, VecSize>; ArrT in_arr = *reinterpret_cast(x + offset); #pragma unroll for (int i = 0; i < VecSize; ++i) { @@ -77,7 +77,7 @@ static __global__ void FP16FastGeluBwdCUDAKernel(const __half* x, static_cast(threadIdx.x + blockIdx.x * blockDim.x) * VecSize; size_t stride = static_cast(blockDim.x * gridDim.x) * VecSize; for (; offset < n; offset += stride) { - using ArrT = platform::AlignedVector<__half, VecSize>; + using ArrT = phi::AlignedVector<__half, VecSize>; ArrT x_in_arr = *reinterpret_cast(x + offset); ArrT y_g_in_arr = *reinterpret_cast(y_g + offset); #pragma unroll @@ -103,7 +103,7 @@ static bool TryLaunchFP16FastGeluFwdVectorizeCUDAKernel( #define PD_LAUNCH_FP16_FAST_GELU_FWD_KERNEL(__vec_size, __use_fast_math) \ do { \ constexpr auto kAlignment = \ - alignof(platform::AlignedVector<__half, __vec_size>); \ + alignof(phi::AlignedVector<__half, __vec_size>); \ if (n % __vec_size == 0 && is_aligned(x, kAlignment) && \ is_aligned(y, kAlignment)) { \ size_t thread = std::min(512, dev_ctx.GetMaxThreadsPerBlock()); \ @@ -138,7 +138,7 @@ static bool TryLaunchFP16FastGeluBwdVectorizeCUDAKernel( #define PD_LAUNCH_FP16_FAST_GELU_BWD_KERNEL(__vec_size, __use_fast_math) \ do { \ constexpr auto kAlignment = \ - alignof(platform::AlignedVector<__half, __vec_size>); \ + alignof(phi::AlignedVector<__half, __vec_size>); \ if (n % __vec_size == 0 && is_aligned(x, kAlignment) && \ is_aligned(x, kAlignment) && is_aligned(y_g, kAlignment) && \ is_aligned(x_g, kAlignment)) { \ diff --git a/paddle/fluid/operators/gelu_op_npu_test.cc b/paddle/fluid/operators/gelu_op_npu_test.cc index 00ff7ad2166dcf99d7b60ec45adfe70b478dedf8..f3ac53138328dbfad12c6d530a6517f40c658677 100644 --- a/paddle/fluid/operators/gelu_op_npu_test.cc +++ b/paddle/fluid/operators/gelu_op_npu_test.cc @@ -24,7 +24,6 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/graph_send_recv_op.cc b/paddle/fluid/operators/graph_send_recv_op.cc index 6af8388d9eba4e4ea8fbb833f84a5c06e182b1f2..f7c006dbcb1a9a23ec619c8d790df8a093530eee 100644 --- a/paddle/fluid/operators/graph_send_recv_op.cc +++ b/paddle/fluid/operators/graph_send_recv_op.cc @@ -12,7 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/graph_send_recv_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/ternary.h" namespace paddle { namespace operators { @@ -21,59 +24,6 @@ class GraphSendRecvOP : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "GraphSendRecv"); - OP_INOUT_CHECK(ctx->HasInput("Src_index"), "Input", "Src_index", - "GraphSendRecv"); - OP_INOUT_CHECK(ctx->HasInput("Dst_index"), "Input", "Dst_index", - "GraphSendRecv"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "GraphSendRecv"); - - auto src_index_dims = ctx->GetInputDim("Src_index"); - if (src_index_dims.size() == 2) { - PADDLE_ENFORCE_EQ(src_index_dims[1], 1, - platform::errors::InvalidArgument( - "The last dim of Src_index should be 1 when it " - "is 2D, but we get %d", - src_index_dims[1])); - } else { - PADDLE_ENFORCE_EQ( - src_index_dims.size(), 1, - platform::errors::InvalidArgument( - "The Src_index should be 1D, when it is not 2D, but we get %d", - src_index_dims.size())); - } - - auto dst_index_dims = ctx->GetInputDim("Dst_index"); - if (dst_index_dims.size() == 2) { - PADDLE_ENFORCE_EQ(dst_index_dims[1], 1, - platform::errors::InvalidArgument( - "The last dim of Dst_index should be 1 when it " - "is 2D, but we get %d", - dst_index_dims[1])); - } else { - PADDLE_ENFORCE_EQ( - dst_index_dims.size(), 1, - platform::errors::InvalidArgument("The Dst_index should be 1D, " - "when it is not 2D, but we get %d", - dst_index_dims.size())); - } - - PADDLE_ENFORCE_EQ( - src_index_dims[0], dst_index_dims[0], - platform::errors::InvalidArgument( - "Src_index and Dst_index should have the same shape.")); - - auto dims = ctx->GetInputDim("X"); - ctx->SetOutputDim("Out", dims); - - if (ctx->Attrs().Get("pool_type") == "MEAN") { - OP_INOUT_CHECK(ctx->HasOutput("Dst_count"), "Output", "Dst_count", - "GraphSendRecv"); - ctx->SetOutputDim("Dst_count", {dims[0]}); - } - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -164,20 +114,12 @@ class GraphSendRecvGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; -using CPU = paddle::platform::CPUDeviceContext; +DECLARE_INFER_SHAPE_FUNCTOR(graph_send_recv, GraphSendRecvInferShapeFunctor, + PD_INFER_META(phi::GraphSendRecvInferMeta)); REGISTER_OPERATOR(graph_send_recv, ops::GraphSendRecvOP, ops::GraphSendRecvOpMaker, ops::GraphSendRecvGradOpMaker, - ops::GraphSendRecvGradOpMaker); 
+ ops::GraphSendRecvGradOpMaker, + GraphSendRecvInferShapeFunctor); REGISTER_OPERATOR(graph_send_recv_grad, ops::GraphSendRecvGradOp); -REGISTER_OP_CPU_KERNEL(graph_send_recv, ops::GraphSendRecvOpKernel, - ops::GraphSendRecvOpKernel, - ops::GraphSendRecvOpKernel, - ops::GraphSendRecvOpKernel); - -REGISTER_OP_CPU_KERNEL(graph_send_recv_grad, - ops::GraphSendRecvGradOpKernel, - ops::GraphSendRecvGradOpKernel, - ops::GraphSendRecvGradOpKernel, - ops::GraphSendRecvGradOpKernel); diff --git a/paddle/fluid/operators/graph_send_recv_op.cu b/paddle/fluid/operators/graph_send_recv_op.cu deleted file mode 100644 index f43d31814ac38430d2d473eeca548b63e1a5c1fa..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/graph_send_recv_op.cu +++ /dev/null @@ -1,419 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/graph_send_recv_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" -#include "paddle/fluid/platform/place.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -struct GraphSendRecvSumCUDAFunctor { - DEVICE inline void operator()(const T* params, T* output, const IndexT& in_i, - const IndexT& out_i) { - paddle::platform::CudaAtomicAdd(output + out_i, *(params + in_i)); - } -}; - -template -struct GraphSendRecvMaxCUDAFunctor { - DEVICE inline void operator()(const T* params, T* output, const IndexT& in_i, - const IndexT& out_i) { - paddle::platform::CudaAtomicMax(output + out_i, *(params + in_i)); - } -}; - -template -struct GraphSendRecvMinCUDAFunctor { - DEVICE inline void operator()(const T* params, T* output, const IndexT& in_i, - const IndexT& out_i) { - paddle::platform::CudaAtomicMin(output + out_i, *(params + in_i)); - } -}; - -template -__global__ void GraphSendRecvCUDAKernel(const T* params, - const IndexT* src_indices, - const IndexT* dst_indices, T* output, - size_t index_size, size_t slice_size, - Functor functor) { - CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) { - int64_t indices_i = i / slice_size; - int64_t slice_i = i - indices_i * slice_size; - IndexT src_i = src_indices[indices_i]; - IndexT dst_i = dst_indices[indices_i]; - int64_t in_i = src_i * slice_size + slice_i; - int64_t out_i = dst_i * slice_size + slice_i; - functor(params, output, in_i, out_i); - } -} - -// For max -template -__global__ void InputResetMaxCUDAKernel(T* output, size_t input_size, - size_t slice_size) { - CUDA_KERNEL_LOOP_TYPE(i, input_size * slice_size, int64_t) { - if (*(output + i) == std::numeric_limits::min()) { - *(output + i) = 0; - } - } -} - -// For min -template -__global__ void InputResetMinCUDAKernel(T* output, size_t input_size, - size_t slice_size) { - CUDA_KERNEL_LOOP_TYPE(i, input_size * slice_size, int64_t) { - if (*(output + i) == std::numeric_limits::max()) { 
- *(output + i) = 0; - } - } -} - -// Get dst_count -template -__global__ void ComputeCountCUDAKernel(int* count, const IndexT* dst_indices, - size_t index_size) { - CUDA_KERNEL_LOOP_TYPE(i, index_size, int64_t) { - IndexT dst_i = dst_indices[i]; - paddle::platform::CudaAtomicAdd(count + dst_i, 1); - } -} - -// For forward mean -template -__global__ void ManipulateMeanCUDAKernel(T* output, int* count, - size_t input_size, size_t slice_size) { - CUDA_KERNEL_LOOP_TYPE(i, input_size * slice_size, int64_t) { - int64_t c_index = i / slice_size; - if (*(count + c_index) > 1) { - *(output + i) = *(output + i) / *(count + c_index); - } - } -} - -// For backward mean -template -__global__ void ManipulateMeanGradCUDAKernel( - const T* params, const IndexT* src_indices, const IndexT* dst_indices, - T* output, size_t index_size, size_t slice_size, const int* dst_count) { - CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) { - int64_t indices_i = i / slice_size; - int64_t slice_i = i - indices_i * slice_size; - IndexT src_i = src_indices[indices_i]; - IndexT dst_i = dst_indices[indices_i]; - int64_t in_i = src_i * slice_size + slice_i; - int64_t out_i = dst_i * slice_size + slice_i; - paddle::platform::CudaAtomicAdd(output + out_i, - *(params + in_i) / dst_count[src_i]); - } -} - -// For backward min and max -template -__global__ void ManipulateMinMaxGradCUDAKernel( - const T* params, const IndexT* src_indices, const IndexT* dst_indices, - T* output, size_t index_size, size_t slice_size, const T* ptr_input, - const T* ptr_output) { - CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) { - int64_t indices_i = i / slice_size; - int64_t slice_i = i - indices_i * slice_size; - IndexT src_i = src_indices[indices_i]; - IndexT dst_i = dst_indices[indices_i]; - int64_t in_i = src_i * slice_size + slice_i; - int64_t out_i = dst_i * slice_size + slice_i; - paddle::platform::CudaAtomicAdd( - output + out_i, - *(params + in_i) * (*(ptr_input + out_i) == *(ptr_output + in_i))); - } -} - -template -void GraphSendRecvOpCUDAKernelLaunchHelper( - const framework::ExecutionContext& ctx, const Tensor& src_index, - const Tensor& dst_index) { - auto* X = ctx.Input("X"); - auto* Y = ctx.Output("Out"); - std::string pool_type = ctx.Attr("pool_type"); - - const int& index_size = src_index.dims()[0]; - - T* p_output = Y->mutable_data(ctx.GetPlace()); - const auto& src_dims = X->dims(); - int64_t memset_size = 1; - for (int i = 0; i < src_dims.size(); ++i) { - memset_size *= src_dims[i]; - } - const size_t& memset_bytes = memset_size * sizeof(T); - if (pool_type == "SUM" || pool_type == "MEAN") { -#ifdef PADDLE_WITH_HIP - hipMemset(p_output, 0, memset_bytes); -#else - cudaMemset(p_output, 0, memset_bytes); -#endif - } else if (pool_type == "MAX") { - thrust::device_ptr p_output_ptr(p_output); - thrust::fill(thrust::device, p_output_ptr, p_output_ptr + memset_size, - std::numeric_limits::min()); - } else if (pool_type == "MIN") { - thrust::device_ptr p_output_ptr(p_output); - thrust::fill(thrust::device, p_output_ptr, p_output_ptr + memset_size, - std::numeric_limits::max()); - } - - if (index_size == 0) return; - - int64_t slice_size = 1; - for (int i = 1; i < src_dims.size(); ++i) { - slice_size *= src_dims[i]; - } - const T* p_src = X->data(); - const IndexT* s_index = src_index.data(); - const IndexT* d_index = dst_index.data(); - -#ifdef PADDLE_WITH_HIP - int block = 256; -#else - int block = 1024; -#endif - int64_t n = slice_size * index_size; - const auto& dev_ctx = ctx.cuda_device_context(); - int64_t 
max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0]; - int64_t grid_tmp = (n + block - 1) / block; - int64_t grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx; - int64_t input_size = src_dims[0]; - if (pool_type == "SUM") { - GraphSendRecvSumCUDAFunctor functor; - GraphSendRecvCUDAKernel><<< - grid, block, 0, reinterpret_cast( - ctx.device_context()) - .stream()>>>(p_src, s_index, d_index, p_output, - index_size, slice_size, functor); - } else if (pool_type == "MAX") { - GraphSendRecvMaxCUDAFunctor functor; - GraphSendRecvCUDAKernel><<< - grid, block, 0, reinterpret_cast( - ctx.device_context()) - .stream()>>>(p_src, s_index, d_index, p_output, - index_size, slice_size, functor); - - int64_t grid_max_tmp = (input_size * slice_size + block - 1) / block; - int64_t grid_max = - grid_max_tmp < max_grid_dimx ? grid_max_tmp : max_grid_dimx; - InputResetMaxCUDAKernel< - T><<( - ctx.device_context()) - .stream()>>>(p_output, input_size, slice_size); - } else if (pool_type == "MIN") { - GraphSendRecvMinCUDAFunctor functor; - GraphSendRecvCUDAKernel><<< - grid, block, 0, reinterpret_cast( - ctx.device_context()) - .stream()>>>(p_src, s_index, d_index, p_output, - index_size, slice_size, functor); - - int64_t grid_min_tmp = (input_size * slice_size + block - 1) / block; - int64_t grid_min = - grid_min_tmp < max_grid_dimx ? grid_min_tmp : max_grid_dimx; - InputResetMinCUDAKernel< - T><<( - ctx.device_context()) - .stream()>>>(p_output, input_size, slice_size); - } else if (pool_type == "MEAN") { - GraphSendRecvSumCUDAFunctor functor; - GraphSendRecvCUDAKernel><<< - grid, block, 0, reinterpret_cast( - ctx.device_context()) - .stream()>>>(p_src, s_index, d_index, p_output, - index_size, slice_size, functor); - - auto* dst_count = ctx.Output("Dst_count"); - int* p_dst_count = dst_count->mutable_data(ctx.GetPlace()); - -#ifdef PADDLE_WITH_HIP - hipMemset(p_dst_count, 0, input_size * sizeof(int)); -#else - cudaMemset(p_dst_count, 0, input_size * sizeof(int)); -#endif - - int64_t grid_count = (index_size + block - 1) / block; - ComputeCountCUDAKernel< - T, IndexT><<( - ctx.device_context()) - .stream()>>>(p_dst_count, d_index, index_size); - - int64_t grid_mean_tmp = (input_size * slice_size + block - 1) / block; - int64_t grid_mean = - grid_mean_tmp < max_grid_dimx ? 
grid_mean_tmp : max_grid_dimx; - ManipulateMeanCUDAKernel< - T><<( - ctx.device_context()) - .stream()>>>(p_output, p_dst_count, input_size, slice_size); - } -} - -template -void GraphSendRecvGradOpCUDAKernelLaunchHelper( - const framework::ExecutionContext& ctx, const Tensor& src_index, - const Tensor& dst_index) { - auto* X = ctx.Input(framework::GradVarName("Out")); - auto* Y = ctx.Output(framework::GradVarName("X")); - std::string pool_type = ctx.Attr("pool_type"); - - const int& index_size = src_index.dims()[0]; - - T* p_output = Y->mutable_data(ctx.GetPlace()); - const auto& src_dims = X->dims(); - int64_t memset_size = 1; - for (int i = 0; i < src_dims.size(); ++i) { - memset_size *= src_dims[i]; - } - const size_t& memset_bytes = memset_size * sizeof(T); - -#ifdef PADDLE_WITH_HIP - hipMemset(p_output, 0, memset_bytes); -#else - cudaMemset(p_output, 0, memset_bytes); -#endif - - if (index_size == 0) return; - - int64_t slice_size = 1; - for (int i = 1; i < src_dims.size(); ++i) { - slice_size *= src_dims[i]; - } - const T* p_src = X->data(); - const IndexT* s_index = src_index.data(); - const IndexT* d_index = dst_index.data(); - -#ifdef PADDLE_WITH_HIP - int block = 256; -#else - int block = 1024; -#endif - int64_t n = slice_size * index_size; - const auto& dev_ctx = ctx.cuda_device_context(); - int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0]; - int64_t grid_tmp = (n + block - 1) / block; - int64_t grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx; - int64_t input_size = src_dims[0]; - if (pool_type == "SUM") { - GraphSendRecvSumCUDAFunctor functor; - GraphSendRecvCUDAKernel><<< - grid, block, 0, reinterpret_cast( - ctx.device_context()) - .stream()>>>(p_src, s_index, d_index, p_output, - index_size, slice_size, functor); - } else if (pool_type == "MEAN") { - auto* dst_count = ctx.Input("Dst_count"); - const int* s_count = dst_count->data(); - ManipulateMeanGradCUDAKernel<<< - grid, block, 0, reinterpret_cast( - ctx.device_context()) - .stream()>>>(p_src, s_index, d_index, p_output, - index_size, slice_size, s_count); - } else if (pool_type == "MAX" || pool_type == "MIN") { - auto* input = ctx.Input("X"); - auto* output = ctx.Input("Out"); - const T* ptr_input = input->data(); - const T* ptr_output = output->data(); - ManipulateMinMaxGradCUDAKernel<<< - grid, block, 0, reinterpret_cast( - ctx.device_context()) - .stream()>>>(p_src, s_index, d_index, p_output, - index_size, slice_size, ptr_input, - ptr_output); - } -} - -template -class GraphSendRecvOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* src_index = ctx.Input("Src_index"); - auto* dst_index = ctx.Input("Dst_index"); - auto index_type = framework::TransToProtoVarType(src_index->dtype()); - - if (index_type == framework::proto::VarType::INT32) { - GraphSendRecvOpCUDAKernelLaunchHelper( - ctx, *src_index, *dst_index); - } else if (index_type == framework::proto::VarType::INT64) { - GraphSendRecvOpCUDAKernelLaunchHelper( - ctx, *src_index, *dst_index); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Unsupported Src_index or Dst_index dtype, expected int, int64, but " - "got %s.", - index_type)); - } - } -}; - -template -class GraphSendRecvGradOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* src_index = ctx.Input("Dst_index"); - auto* dst_index = ctx.Input("Src_index"); - auto index_type = 
framework::TransToProtoVarType(src_index->dtype()); - - if (index_type == framework::proto::VarType::INT32) { - GraphSendRecvGradOpCUDAKernelLaunchHelper( - ctx, *src_index, *dst_index); - } else if (index_type == framework::proto::VarType::INT64) { - GraphSendRecvGradOpCUDAKernelLaunchHelper( - ctx, *src_index, *dst_index); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Unsupported Src_index or Dst_index dtype, expected int, int64, but " - "got %s.", - index_type)); - } - } -}; - -} // namespace operators -} // namespace paddle - -using CUDA = paddle::platform::CUDADeviceContext; -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL(graph_send_recv, - ops::GraphSendRecvOpCUDAKernel, - ops::GraphSendRecvOpCUDAKernel, - ops::GraphSendRecvOpCUDAKernel, - ops::GraphSendRecvOpCUDAKernel); - -REGISTER_OP_CUDA_KERNEL(graph_send_recv_grad, - ops::GraphSendRecvGradOpCUDAKernel, - ops::GraphSendRecvGradOpCUDAKernel, - ops::GraphSendRecvGradOpCUDAKernel, - ops::GraphSendRecvGradOpCUDAKernel); diff --git a/paddle/fluid/operators/graph_send_recv_op.h b/paddle/fluid/operators/graph_send_recv_op.h deleted file mode 100644 index 8d8111e0ee845bf6828ee53459e6d86bdebba484..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/graph_send_recv_op.h +++ /dev/null @@ -1,291 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/platform/place.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -struct GraphSendRecvSumFunctor { - void operator()(const bool& first_flag, const Tensor& src_slice, - Tensor* dst_slice) { - auto eigen_src = framework::EigenVector::Flatten(src_slice); - auto eigen_dst = framework::EigenVector::Flatten(*dst_slice); - eigen_dst += eigen_src; - } -}; - -template -struct GraphSendRecvMinFunctor { - void operator()(const bool& first_flag, const Tensor& src_slice, - Tensor* dst_slice) { - auto eigen_src = framework::EigenVector::Flatten(src_slice); - auto eigen_dst = framework::EigenVector::Flatten(*dst_slice); - if (first_flag) { - eigen_dst += eigen_src; - } else { - eigen_dst = eigen_dst.cwiseMin(eigen_src); - } - } -}; - -template -struct GraphSendRecvMaxFunctor { - void operator()(const int& first_flag, const Tensor& src_slice, - Tensor* dst_slice) { - auto eigen_src = framework::EigenVector::Flatten(src_slice); - auto eigen_dst = framework::EigenVector::Flatten(*dst_slice); - if (first_flag) { - eigen_dst += eigen_src; - } else { - eigen_dst = eigen_dst.cwiseMax(eigen_src); - } - } -}; - -template -void elementwise_inner_operation(const Tensor& src, Tensor* dst, - const IndexT& src_index, - const IndexT& dst_index, - const bool& first_flag, Functor functor) { - auto src_slice = src.Slice(src_index, src_index + 1); - auto dst_slice = dst->Slice(dst_index, dst_index + 1); - - functor(first_flag, src_slice, &dst_slice); -} - -template -void graph_send_recv_cpu_for_loop(const int& input_size, const int& index_size, - const IndexT* s_index, const IndexT* d_index, - const Tensor& src, Tensor* dst, - const std::string& pool_type, - int* dst_count = nullptr) { - Functor functor; - if (pool_type == "SUM") { - for (int i = 0; i < index_size; ++i) { - const IndexT& src_idx = s_index[i]; - const IndexT& dst_idx = d_index[i]; - elementwise_inner_operation(src, dst, src_idx, - dst_idx, false, functor); - } - } else if (pool_type == "MEAN") { - for (int i = 0; i < index_size; ++i) { - const IndexT& src_idx = s_index[i]; - const IndexT& dst_idx = d_index[i]; - elementwise_inner_operation(src, dst, src_idx, - dst_idx, false, functor); - } - for (int i = 0; i < index_size; ++i) { - IndexT dst_idx = d_index[i]; - *(dst_count + dst_idx) += 1; - } - for (int i = 0; i < input_size; ++i) { - if (*(dst_count + i) == 0) continue; - auto dst_slice = dst->Slice(i, i + 1); - auto eigen_dst = framework::EigenVector::Flatten(dst_slice); - eigen_dst = eigen_dst / static_cast(*(dst_count + i)); - } - } else if (pool_type == "MIN" || pool_type == "MAX") { - std::set existed_dst; - for (int i = 0; i < index_size; ++i) { - const IndexT& src_idx = s_index[i]; - const IndexT& dst_idx = d_index[i]; - bool in_set = existed_dst.find(dst_idx) != existed_dst.end(); - if (!in_set) { - elementwise_inner_operation(src, dst, src_idx, - dst_idx, true, functor); - existed_dst.emplace(dst_idx); - } else { - elementwise_inner_operation( - src, dst, src_idx, dst_idx, false, functor); - } - } - } -} - -template -void graph_send_recv_cpu_for_loop_grad( - const int& input_size, const int& index_size, const IndexT* s_index, - const IndexT* d_index, const Tensor& src, Tensor* dst, - const std::string& pool_type, const int* dst_count = nullptr, - const Tensor* input = nullptr, const Tensor* output = nullptr) { - if 
(pool_type == "SUM") { - Functor functor; - for (int i = 0; i < index_size; ++i) { - const IndexT& src_idx = s_index[i]; - const IndexT& dst_idx = d_index[i]; - elementwise_inner_operation(src, dst, src_idx, - dst_idx, false, functor); - } - } else if (pool_type == "MEAN") { - for (int i = 0; i < index_size; ++i) { - const IndexT& src_idx = s_index[i]; - const IndexT& dst_idx = d_index[i]; - auto src_slice = src.Slice(src_idx, src_idx + 1); - auto dst_slice = dst->Slice(dst_idx, dst_idx + 1); - auto eigen_src = framework::EigenVector::Flatten(src_slice); - auto eigen_dst = framework::EigenVector::Flatten(dst_slice); - eigen_dst += (eigen_src / static_cast(dst_count[src_idx])); - } - } else if (pool_type == "MIN" || pool_type == "MAX") { - for (int i = 0; i < index_size; ++i) { - const IndexT& forward_src_idx = d_index[i]; - const IndexT& forward_dst_idx = s_index[i]; - auto input_slice = input->Slice(forward_src_idx, forward_src_idx + 1); - auto output_slice = output->Slice(forward_dst_idx, forward_dst_idx + 1); - auto eigen_input = framework::EigenVector::Flatten(input_slice); - auto eigen_output = framework::EigenVector::Flatten(output_slice); - - auto src_slice = src.Slice(forward_dst_idx, forward_dst_idx + 1); - auto dst_slice = dst->Slice(forward_src_idx, forward_src_idx + 1); - auto eigen_src = framework::EigenVector::Flatten(src_slice); - auto eigen_dst = framework::EigenVector::Flatten(dst_slice); - eigen_dst += eigen_src * (eigen_output == eigen_input); - } - } -} - -template -void GraphSendRecvOpKernelLaunchHelper(const framework::ExecutionContext& ctx, - const Tensor& src_index) { - auto* X = ctx.Input("X"); - auto* dst_index = ctx.Input("Dst_index"); - auto* Y = ctx.Output("Out"); - - const int& index_size = src_index.dims()[0]; - - T* p_output = Y->mutable_data(ctx.GetPlace()); - const auto& src_dims = X->dims(); - int64_t memset_size = 1; - for (int i = 0; i < src_dims.size(); ++i) memset_size *= src_dims[i]; - const size_t& memset_bytes = memset_size * sizeof(T); - memset(p_output, 0, memset_bytes); - - if (index_size == 0) return; - - const IndexT* s_index = src_index.data(); - const IndexT* d_index = dst_index->data(); - const std::string& pool_type = ctx.Attr("pool_type"); - if (pool_type == "SUM") { - graph_send_recv_cpu_for_loop>( - src_dims[0], index_size, s_index, d_index, *X, Y, pool_type); - } else if (pool_type == "MIN") { - graph_send_recv_cpu_for_loop>( - src_dims[0], index_size, s_index, d_index, *X, Y, pool_type); - } else if (pool_type == "MAX") { - graph_send_recv_cpu_for_loop>( - src_dims[0], index_size, s_index, d_index, *X, Y, pool_type); - } else if (pool_type == "MEAN") { - auto* dst_count = ctx.Output("Dst_count"); - int* p_dst_count = dst_count->mutable_data(ctx.GetPlace()); - memset(p_dst_count, 0, src_dims[0] * sizeof(int)); - graph_send_recv_cpu_for_loop>( - src_dims[0], index_size, s_index, d_index, *X, Y, pool_type, - p_dst_count); - } -} - -template -void GraphSendRecvGradOpKernelLaunchHelper( - const framework::ExecutionContext& ctx, const Tensor& src_index) { - auto* X = ctx.Input(framework::GradVarName("Out")); - auto* dst_index = ctx.Input("Src_index"); - auto* Y = ctx.Output(framework::GradVarName("X")); - - const int& index_size = src_index.dims()[0]; - - T* p_output = Y->mutable_data(ctx.GetPlace()); - const auto& src_dims = X->dims(); - int64_t memset_size = 1; - for (int i = 0; i < src_dims.size(); ++i) memset_size *= src_dims[i]; - const size_t& memset_bytes = memset_size * sizeof(T); - memset(p_output, 0, memset_bytes); - - if 
(index_size == 0) return; - - const IndexT* s_index = src_index.data(); - const IndexT* d_index = dst_index->data(); - - const std::string& pool_type = ctx.Attr("pool_type"); - if (pool_type == "SUM") { - graph_send_recv_cpu_for_loop_grad>( - src_dims[0], index_size, s_index, d_index, *X, Y, pool_type); - } else if (pool_type == "MEAN") { - auto* dst_count = ctx.Input("Dst_count"); - const int* s_count = dst_count->data(); - // Functor not used here. - graph_send_recv_cpu_for_loop_grad>( - src_dims[0], index_size, s_index, d_index, *X, Y, pool_type, s_count); - } else if (pool_type == "MIN" || pool_type == "MAX") { - const auto* input = ctx.Input("X"); - const auto* output = ctx.Input("Out"); - // Functor not used here. - graph_send_recv_cpu_for_loop_grad>( - src_dims[0], index_size, s_index, d_index, *X, Y, pool_type, nullptr, - input, output); - } -} - -template -class GraphSendRecvOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* src_index = ctx.Input("Src_index"); - auto index_type = framework::TransToProtoVarType(src_index->dtype()); - - if (index_type == framework::proto::VarType::INT32) { - GraphSendRecvOpKernelLaunchHelper(ctx, *src_index); - } else if (index_type == framework::proto::VarType::INT64) { - GraphSendRecvOpKernelLaunchHelper(ctx, - *src_index); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Unsupported Src_index or Dst_index type, Expected int, int64, but " - "got %s.", - index_type)); - } - } -}; - -template -class GraphSendRecvGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* src_index = ctx.Input("Dst_index"); - auto index_type = framework::TransToProtoVarType(src_index->dtype()); - - if (index_type == framework::proto::VarType::INT32) { - GraphSendRecvGradOpKernelLaunchHelper(ctx, - *src_index); - } else if (index_type == framework::proto::VarType::INT64) { - GraphSendRecvGradOpKernelLaunchHelper( - ctx, *src_index); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Unsupported Src_index or Dst_index type, Expected int, int64, but " - "got %s.", - index_type)); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/grid_sampler_op.h b/paddle/fluid/operators/grid_sampler_op.h index 8f3c6660f51c4de80e5a98370eae0381abe333a6..93e96694270a458844bbcabf78f2559975902c2f 100644 --- a/paddle/fluid/operators/grid_sampler_op.h +++ b/paddle/fluid/operators/grid_sampler_op.h @@ -18,7 +18,6 @@ limitations under the License. */ #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/gather.h" #include "paddle/phi/core/hostdevice.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/group_norm_op.cu b/paddle/fluid/operators/group_norm_op.cu index 72a90d17998d84f0d0d4e081543acae94756e635..b376334f1e93cc3be9e716d808525011edb29b94 100644 --- a/paddle/fluid/operators/group_norm_op.cu +++ b/paddle/fluid/operators/group_norm_op.cu @@ -29,6 +29,7 @@ namespace operators { using DataLayout = framework::DataLayout; enum GroupNormKernelFlags { kHasScale = 1, kHasBias = 2 }; +#define ALIGN_BYTES 16 #define CHECK_CASE(i, flags, kernel_name, ...) 
\ if (i == flags) { \ @@ -56,8 +57,7 @@ __device__ __inline__ void CudaAtomicAddWithWarp(T* sum, T value) { template __global__ void GroupNormForwardGetMeanAndVar(const T* x, int N, int C, int W, int imsize, int groups, - int group_size, T* mean, T* var, - const DataLayout data_layout) { + int group_size, T* mean, T* var) { int gid = blockIdx.y; int cid = blockIdx.x; int bid = blockIdx.z; @@ -68,13 +68,10 @@ __global__ void GroupNormForwardGetMeanAndVar(const T* x, int N, int C, int W, T x_mean = 0, x_var = 0; for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) { T val; - if (data_layout == DataLayout::kNCHW) { - val = x[(bid * C + ccid) * imsize + imid]; - } else { - int hid = imid / W; - int wid = imid % W; - val = x[(bid * H + hid) * W * C + wid * C + ccid]; - } + int hid = imid / W; + int wid = imid % W; + val = x[(bid * H + hid) * W * C + wid * C + ccid]; + x_mean += val; x_var += val * val; } @@ -84,6 +81,85 @@ __global__ void GroupNormForwardGetMeanAndVar(const T* x, int N, int C, int W, CudaAtomicAddWithWarp(&var[bid * groups + gid], x_var); } +template +__device__ __forceinline__ void ThreadReduce(const T* input, int size, + const int offset, AccT* mean, + AccT* var) { + using VecT = kps::details::VectorType; + int tid = threadIdx.x; + if (offset > 0) { + input -= offset; + size += offset; + if (tid >= offset) { + AccT temp = input[tid]; + *mean += temp; + *var += temp * temp; + } + size -= blockDim.x; + input += blockDim.x; + } + int remain = size % (VecSize * blockDim.x); + + T ins[VecSize]; + VecT* ins_vec = reinterpret_cast(&ins); + + // vector part + for (; VecSize * tid < (size - remain); tid += blockDim.x) { + *ins_vec = reinterpret_cast(input)[tid]; + +#pragma unroll + for (int i = 0; i < VecSize; ++i) { + AccT temp = ins[i]; + *mean += temp; + *var += temp * temp; + } + } + + // scalar part + tid = size - remain + threadIdx.x; + for (; tid < size; tid += blockDim.x) { + AccT temp = input[tid]; + *mean += temp; + *var += temp * temp; + } +} + +template +__global__ void ScalarGetMeanAndVarNCHW(const T* x, T* mean, T* var, int size) { + int i = blockIdx.x; + T x_mean = 0, x_var = 0; + for (int j = threadIdx.x; j < size; j += blockDim.x) { + T val; + val = x[i * size + j]; + x_mean += val; + x_var += val * val; + } + x_mean /= size; + x_var /= size; + CudaAtomicAddWithWarp(&mean[i], x_mean); + CudaAtomicAddWithWarp(&var[i], x_var); +} + +template +__global__ void VectorizedGetMeanAndVarNCHW(const T* x, T* mean, T* var, + int size) { + int i = blockIdx.x; + AccT x_mean = static_cast(0); + AccT x_var = static_cast(0); + const int input_offset = ((uint64_t)x) % ALIGN_BYTES / sizeof(T); + x += i * size; + ThreadReduce(x, size, input_offset, &x_mean, &x_var); + x_mean = kps::details::BlockXReduce>( + x_mean, kps::AddFunctor()); + x_var = kps::details::BlockXReduce>( + x_var, kps::AddFunctor()); + __syncthreads(); + if (threadIdx.x == 0) { + mean[i] = static_cast(x_mean / size); + var[i] = static_cast(x_var / size); + } +} + template __global__ void GroupNormForward(const T* x, const T* mean, const T* var, const T* scale, const T* bias, int N, int C, @@ -96,26 +172,34 @@ __global__ void GroupNormForward(const T* x, const T* mean, const T* var, int H = imsize / W; int ccid = gid * group_size + cid; if (ccid >= C) return; - T x_mean = mean[bid * groups + gid]; - T x_var = var[bid * groups + gid]; + auto ng = bid * groups + gid; + T x_mean = mean[ng]; + T x_var = var[ng]; x_var = x_var - x_mean * x_mean; - T var_inv = 1.0 / sqrt(x_var + epsilon); - if (cid == 0 && 
threadIdx.x == 0) real_var[bid * groups + gid] = x_var; + T var_inv = rsqrt(x_var + epsilon); + if (cid == 0 && threadIdx.x == 0) { + real_var[ng] = x_var; + } for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) { T val; int hid, wid; + int index = (bid * C + ccid) * imsize + imid; if (data_layout == DataLayout::kNCHW) { - val = x[(bid * C + ccid) * imsize + imid]; + val = x[index]; } else { hid = imid / W; wid = imid % W; val = x[(bid * H + hid) * W * C + wid * C + ccid]; } val = (val - x_mean) * var_inv; - if (flags & kHasScale) val *= scale[gid * group_size + cid]; - if (flags & kHasBias) val += bias[gid * group_size + cid]; + if (flags & kHasScale) { + val *= scale[ccid]; + } + if (flags & kHasBias) { + val += bias[ccid]; + } if (data_layout == DataLayout::kNCHW) { - y[(bid * C + ccid) * imsize + imid] = val; + y[index] = val; } else { y[(bid * H + hid) * W * C + wid * C + ccid] = val; } @@ -182,16 +266,41 @@ class GroupNormKernel imsize *= x_dims[i]; } } + #ifdef __HIPCC__ int block_size = std::max(std::min(256, imsize), 64); #else int block_size = std::min(1024, imsize); #endif + dim3 grid(group_size, groups, x_dims[0]); dim3 threads(block_size, 1, 1); - GroupNormForwardGetMeanAndVar<<>>( - x_data, x_dims[0], C, W, imsize, groups, group_size, mean_data, - temp_var_data, data_layout); + if (data_layout == DataLayout::kNCHW) { + using AccT = typename details::MPTypeTrait::Type; + constexpr int vec_size = sizeof(float4) / sizeof(T); + int size = group_size * imsize; + const int max_num_threads = 1024; + int max_block_size = std::min(size / vec_size, max_num_threads); + int block_size_nchw = 1; + while (block_size_nchw < max_block_size) { + block_size_nchw *= 2; + } + block_size_nchw = std::max(block_size_nchw, kps::details::kWarpSize); + dim3 grids(x_dims[0] * groups); + dim3 blocks(block_size_nchw); + if (size < vec_size) { + ScalarGetMeanAndVarNCHW<<>>( + x_data, mean_data, temp_var_data, size); + } else { + VectorizedGetMeanAndVarNCHW< + T, AccT, vec_size><<>>( + x_data, mean_data, temp_var_data, size); + } + } else { + GroupNormForwardGetMeanAndVar<<>>( + x_data, x_dims[0], C, W, imsize, groups, group_size, mean_data, + temp_var_data); + } int flags = (scale_data != nullptr) * kHasScale + (bias_data != nullptr) * kHasBias; UNROLL_ALL_CASES(flags, GroupNormForward, x_data, mean_data, temp_var_data, diff --git a/paddle/fluid/operators/gru_op.cc b/paddle/fluid/operators/gru_op.cc index 88530b5352d31df7fac6eb122867f275777e40f6..d7cf03ddd6189393d16281b434c4dd5b4984e923 100644 --- a/paddle/fluid/operators/gru_op.cc +++ b/paddle/fluid/operators/gru_op.cc @@ -15,9 +15,9 @@ limitations under the License. 
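Note (editorial, not part of the patch): the new vectorized NCHW reduction above accumulates the per-group sums of x and x*x and stores E[x] and E[x^2]; GroupNormForward then recovers the biased variance via Var(x) = E[x^2] - E[x]^2 before applying rsqrt. A minimal sketch of that identity, with a hypothetical helper name:

// Recover mean and biased variance from the two accumulated sums.
template <typename AccT>
void MeanVarFromSums(AccT sum_x, AccT sum_x2, AccT n, AccT* mean, AccT* var) {
  *mean = sum_x / n;                       // E[x]
  *var = sum_x2 / n - (*mean) * (*mean);   // E[x^2] - E[x]^2
}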
*/ #include "paddle/fluid/operators/gru_op.h" #include #include -#include "paddle/fluid/operators/math/detail/gru_cpu_kernel.h" -#include "paddle/fluid/operators/math/detail/gru_kernel.h" #include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/detail/gru_cpu_kernel.h" +#include "paddle/phi/kernels/funcs/detail/gru_kernel.h" DECLARE_int32(paddle_num_threads); @@ -316,7 +316,7 @@ class GRUCPUKernel : public framework::OpKernel { batch_hidden->mutable_data(context.GetPlace()); bool is_reverse = context.Attr("is_reverse"); - math::LoDTensor2BatchFunctor to_batch; + phi::funcs::LoDTensor2BatchFunctor to_batch; auto& dev_ctx = context.template device_context(); to_batch(dev_ctx, *input, batch_gate, true, is_reverse); @@ -326,7 +326,7 @@ class GRUCPUKernel : public framework::OpKernel { } int frame_size = hidden_dims[1]; - math::GRUMetaValue gru_value; + phi::funcs::GRUMetaValue gru_value; gru_value.gate_weight = const_cast(weight_data); gru_value.state_weight = const_cast(weight_data + 2 * frame_size * frame_size); @@ -347,9 +347,9 @@ class GRUCPUKernel : public framework::OpKernel { } auto batch_starts = batch_gate->lod()[0]; size_t seq_len = batch_starts.size() - 1; - auto active_node = math::detail::GetActivationType( + auto active_node = phi::funcs::detail::GetActivationType( context.Attr("activation")); - auto active_gate = math::detail::GetActivationType( + auto active_gate = phi::funcs::detail::GetActivationType( context.Attr("gate_activation")); #ifdef PADDLE_WITH_MKLML @@ -396,9 +396,9 @@ class GRUCPUKernel : public framework::OpKernel { frame_size * 2, T(1), gru_value.gate_value, frame_size * 3); } - math::detail::forward_reset_output( - math::detail::forward::gru_resetOutput(), gru_value, frame_size, - cur_batch_size, active_gate); + phi::funcs::detail::forward_reset_output( + phi::funcs::detail::forward::gru_resetOutput(), gru_value, + frame_size, cur_batch_size, active_gate); if (gru_value.prev_out_value) { blas.GEMM_COMPUTE( @@ -408,9 +408,9 @@ class GRUCPUKernel : public framework::OpKernel { frame_size * 3); } - math::detail::forward_final_output( - math::detail::forward::gru_finalOutput(), gru_value, frame_size, - cur_batch_size, active_node, origin_mode); + phi::funcs::detail::forward_final_output( + phi::funcs::detail::forward::gru_finalOutput(), gru_value, + frame_size, cur_batch_size, active_node, origin_mode); gru_value.prev_out_value = gru_value.output_value; } @@ -432,7 +432,7 @@ class GRUCPUKernel : public framework::OpKernel { gru_value.gate_value = gate_t.data(); gru_value.reset_output_value = reset_hidden_prev_t.data(); - math::GRUUnitFunctor::compute( + phi::funcs::GRUUnitFunctor::compute( dev_ctx, gru_value, frame_size, cur_batch_size, active_node, active_gate, origin_mode); @@ -441,7 +441,7 @@ class GRUCPUKernel : public framework::OpKernel { #ifdef PADDLE_WITH_MKLML } #endif - math::Batch2LoDTensorFunctor to_seq; + phi::funcs::Batch2LoDTensorFunctor to_seq; batch_hidden->set_lod(batch_gate->lod()); to_seq(dev_ctx, *batch_hidden, hidden); } diff --git a/paddle/fluid/operators/gru_op.cu.cc b/paddle/fluid/operators/gru_op.cu.cc index 7d055240916f621d90a3496ee241d9348e88b71d..5be0acc15432c896872a70e0a87949faea496a42 100644 --- a/paddle/fluid/operators/gru_op.cu.cc +++ b/paddle/fluid/operators/gru_op.cu.cc @@ -65,7 +65,7 @@ class GRUKernel : public framework::OpKernel { batch_hidden->mutable_data(context.GetPlace()); bool is_reverse = context.Attr("is_reverse"); - math::LoDTensor2BatchFunctor to_batch; + phi::funcs::LoDTensor2BatchFunctor 
to_batch; auto& dev_ctx = context.template device_context(); to_batch(dev_ctx, *input, batch_gate, true, is_reverse); @@ -75,7 +75,7 @@ class GRUKernel : public framework::OpKernel { } int frame_size = hidden_dims[1]; - math::GRUMetaValue gru_value; + phi::funcs::GRUMetaValue gru_value; gru_value.gate_weight = const_cast(weight_data); gru_value.state_weight = const_cast(weight_data + 2 * frame_size * frame_size); @@ -96,9 +96,9 @@ class GRUKernel : public framework::OpKernel { } auto batch_starts = batch_gate->lod()[0]; size_t num_batch = batch_starts.size() - 1; - auto active_node = math::detail::GetActivationType( + auto active_node = phi::funcs::detail::GetActivationType( context.Attr("activation")); - auto active_gate = math::detail::GetActivationType( + auto active_gate = phi::funcs::detail::GetActivationType( context.Attr("gate_activation")); for (size_t n = 0; n < num_batch; n++) { int bstart = static_cast(batch_starts[n]); @@ -111,13 +111,13 @@ class GRUKernel : public framework::OpKernel { gru_value.output_value = hidden_t.data(); gru_value.gate_value = gate_t.data(); gru_value.reset_output_value = reset_hidden_prev_t.data(); - math::GRUUnitFunctor::compute( + phi::funcs::GRUUnitFunctor::compute( dev_ctx, gru_value, frame_size, cur_batch_size, active_node, active_gate, origin_mode); gru_value.prev_out_value = gru_value.output_value; } - math::Batch2LoDTensorFunctor to_seq; + phi::funcs::Batch2LoDTensorFunctor to_seq; batch_hidden->set_lod(batch_gate->lod()); to_seq(dev_ctx, *batch_hidden, hidden); } diff --git a/paddle/fluid/operators/gru_op.h b/paddle/fluid/operators/gru_op.h index 130b10c7390110770336099c3ac64966389441eb..852655034c8c277f7e7bf1fb562951c26223c101 100644 --- a/paddle/fluid/operators/gru_op.h +++ b/paddle/fluid/operators/gru_op.h @@ -16,10 +16,10 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/detail/activation_functions.h" -#include "paddle/fluid/operators/math/gru_compute.h" -#include "paddle/fluid/operators/math/sequence2batch.h" +#include "paddle/phi/kernels/funcs/detail/activation_functions.h" +#include "paddle/phi/kernels/funcs/gru_compute.h" #include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/sequence2batch.h" namespace paddle { namespace operators { @@ -32,7 +32,7 @@ inline void ReorderInitState(const DeviceContext& ctx, const framework::Tensor& src, framework::Vector index_lod, framework::Tensor* dst, bool indexed_src) { - math::CopyMatrixRowsFunctor row_shuffle; + phi::funcs::CopyMatrixRowsFunctor row_shuffle; dst->mutable_data(src.dims(), ctx.GetPlace()); row_shuffle(ctx, src, index_lod, dst, indexed_src); } @@ -63,7 +63,7 @@ class GRUGradKernel : public framework::OpKernel { auto hidden_dims = hidden->dims(); int frame_size = hidden_dims[1]; - math::LoDTensor2BatchFunctor to_batch; + phi::funcs::LoDTensor2BatchFunctor to_batch; LoDTensor batch_hidden_grad, batch_gate_grad, batch_reset_hidden_prev_grad; batch_hidden_grad.mutable_data(hidden_dims, context.GetPlace()); batch_gate_grad.mutable_data(gate_dims, context.GetPlace()); @@ -93,12 +93,12 @@ class GRUGradKernel : public framework::OpKernel { batch_hidden_grad.set_lod(batch_hidden->lod()); to_batch(dev_ctx, *hidden_grad, &batch_hidden_grad, false, is_reverse); - math::GRUMetaValue gru_value; + phi::funcs::GRUMetaValue gru_value; gru_value.gate_weight = const_cast(weight_data); gru_value.state_weight = const_cast(weight_data + 2 * frame_size * frame_size); - math::GRUMetaGrad gru_grad; + phi::funcs::GRUMetaGrad gru_grad; if (weight_grad) { gru_grad.gate_weight_grad = weight_grad->mutable_data(context.GetPlace()); @@ -112,9 +112,9 @@ class GRUGradKernel : public framework::OpKernel { auto batch_starts = batch_hidden_grad.lod()[0]; size_t num_batch = batch_starts.size() - 1; - auto active_node = math::detail::GetActivationType( + auto active_node = phi::funcs::detail::GetActivationType( context.Attr("activation")); - auto active_gate = math::detail::GetActivationType( + auto active_gate = phi::funcs::detail::GetActivationType( context.Attr("gate_activation")); for (int n = static_cast(num_batch) - 1; n >= 0; n--) { int bstart = static_cast(batch_starts[n]); @@ -145,13 +145,13 @@ class GRUGradKernel : public framework::OpKernel { gru_grad.prev_out_grad = hidden_prev_grad_t.data(); } gru_value.output_value = nullptr; - math::GRUUnitGradFunctor::compute( + phi::funcs::GRUUnitGradFunctor::compute( dev_ctx, gru_value, gru_grad, frame_size, cur_batch_size, active_node, active_gate, origin_mode); } if (input_grad) { input_grad->mutable_data(context.GetPlace()); - math::Batch2LoDTensorFunctor to_seq; + phi::funcs::Batch2LoDTensorFunctor to_seq; batch_gate_grad.set_lod(batch_gate->lod()); to_seq(dev_ctx, batch_gate_grad, input_grad); } diff --git a/paddle/fluid/operators/gumbel_softmax_op.cc b/paddle/fluid/operators/gumbel_softmax_op.cc index f8f8f3fd789ad61a99bcc17bc073b6cfd099f639..524f2d6c9d719468876d8a586b6eea13f99a7b79 100644 --- a/paddle/fluid/operators/gumbel_softmax_op.cc +++ b/paddle/fluid/operators/gumbel_softmax_op.cc @@ -90,11 +90,11 @@ class GumbelSoftmaxGradOpMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(gumbel_softmax, GumbelSoftmaxInferShapeFunctor, - 
PT_INFER_META(phi::GumbelSoftmaxInferMeta)); -DELCARE_INFER_SHAPE_FUNCTOR(gumbel_softmax_grad, +DECLARE_INFER_SHAPE_FUNCTOR(gumbel_softmax, GumbelSoftmaxInferShapeFunctor, + PD_INFER_META(phi::GumbelSoftmaxInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(gumbel_softmax_grad, GumbelSoftmaxGradInferShapeFunctor, - PT_INFER_META(phi::GumbelSoftmaxGradInferMeta)); + PD_INFER_META(phi::GumbelSoftmaxGradInferMeta)); REGISTER_OPERATOR(gumbel_softmax, ops::GumbelSoftmaxOp, ops::GumbelSoftmaxOpMaker, diff --git a/paddle/fluid/operators/huber_loss_op.cc b/paddle/fluid/operators/huber_loss_op.cc index 3915ce5809c394738c58e80accccac531c268c23..3c9bbc753f29b1cf104a085d340ddc75cf2790f8 100644 --- a/paddle/fluid/operators/huber_loss_op.cc +++ b/paddle/fluid/operators/huber_loss_op.cc @@ -112,8 +112,8 @@ class HuberLossGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(huber_loss, HuberLossInferShapeFunctor, - PT_INFER_META(phi::HuberLossInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(huber_loss, HuberLossInferShapeFunctor, + PD_INFER_META(phi::HuberLossInferMeta)); REGISTER_OPERATOR(huber_loss, ops::HuberLossOp, ops::HuberLossOpMaker, ops::HuberLossGradOpMaker, diff --git a/paddle/fluid/operators/imag_op.cc b/paddle/fluid/operators/imag_op.cc index 33b68d68992dd819f74c2ae67153ecc6b050b16b..16968876ac96cac2fa1b009ea40b939f1e11a953 100644 --- a/paddle/fluid/operators/imag_op.cc +++ b/paddle/fluid/operators/imag_op.cc @@ -82,8 +82,8 @@ DECLARE_INPLACE_OP_INFERER(ImagGradOpInplaceInferer, } // namespace operators } // namespace paddle -DELCARE_INFER_SHAPE_FUNCTOR(imag, ImagInferShapeFunctor, - PT_INFER_META(phi::UnchangedInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(imag, ImagInferShapeFunctor, + PD_INFER_META(phi::RealAndImagInferMeta)); namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/increment_op.cc b/paddle/fluid/operators/increment_op.cc index 105d818e197434c4ed85126228e06d45bf06e498..e2efaa1759b008dd0055bb6e06917cbd4fc1932f 100644 --- a/paddle/fluid/operators/increment_op.cc +++ b/paddle/fluid/operators/increment_op.cc @@ -87,8 +87,8 @@ class IncrementGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(increment, IncrementInferShapeFunctor, - PT_INFER_META(phi::IncrementInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(increment, IncrementInferShapeFunctor, + PD_INFER_META(phi::IncrementInferMeta)); REGISTER_OPERATOR(increment, ops::IncrementOp, ops::IncrementOpMaker, ops::IncrementGradOpMaker, ops::IncrementGradOpMaker, diff --git a/paddle/fluid/operators/increment_op_npu_test.cc b/paddle/fluid/operators/increment_op_npu_test.cc index 09f4e63943ad3784a598524273831bf875ed9213..8324a6215bca8145ba36dabb3d8108006a57e829 100644 --- a/paddle/fluid/operators/increment_op_npu_test.cc +++ b/paddle/fluid/operators/increment_op_npu_test.cc @@ -24,7 +24,6 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/index_impl.cu.h b/paddle/fluid/operators/index_impl.cu.h index 2e3e6569ef5a88f8dfcb6646974b70bcc6c0c95f..bb26e2f445e7034b8f982594216eacfd3007a24f 100644 --- a/paddle/fluid/operators/index_impl.cu.h +++ b/paddle/fluid/operators/index_impl.cu.h @@ -19,11 +19,11 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/operators/amp/fp16_type_traits.h" -#include "paddle/fluid/operators/distribution_helper.h" #include "paddle/fluid/operators/fill_constant_op.h" -#include "paddle/fluid/platform/aligned_vector.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/core/hostdevice.h" +#include "paddle/phi/kernels/funcs/aligned_vector.h" +#include "paddle/phi/kernels/funcs/distribution_helper.h" #include "paddle/phi/kernels/primitive/kernel_primitives.h" namespace paddle { @@ -58,7 +58,7 @@ void IndexKernel(const KPDevice &dev_ctx, Tensor *out, Functor func) { int numel = out->numel(); T *out_data = out->mutable_data(dev_ctx.GetPlace()); if (numel <= 0) return; - int vec_size = paddle::platform::GetVectorizedSize(out_data); + int vec_size = phi::GetVectorizedSize(out_data); #ifdef PADDLE_WITH_XPU_KP int block = 64; int grid = 8; diff --git a/paddle/fluid/operators/index_sample_op.cc b/paddle/fluid/operators/index_sample_op.cc index 2d97797cfec21ed50f0999fa13f8bb1ae9618b71..d17c6368c7537b93ceb6f1d75b6d73467bd207ac 100644 --- a/paddle/fluid/operators/index_sample_op.cc +++ b/paddle/fluid/operators/index_sample_op.cc @@ -12,12 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/index_sample_op.h" #include #include "paddle/fluid/framework/no_need_buffer_vars_inference.h" -#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { class IndexSampleOpMaker : public framework::OpProtoAndCheckerMaker { @@ -42,44 +44,6 @@ class IndexSampleOpMaker : public framework::OpProtoAndCheckerMaker { class IndexSampleOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( - "Inputs(Input) of FindByIndex should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("Index"), true, - platform::errors::InvalidArgument( - "Inputs(Index) of FindByIndex should not be null.")); - - auto input_dims = ctx->GetInputDim("X"); - PADDLE_ENFORCE_EQ( - input_dims.size(), 2, - platform::errors::InvalidArgument( - "Inputs(X) shape of IndexSample op should be 2-D, but " - "got X's shape = [%s], please check X shape.", - input_dims)); - - auto index_dims = ctx->GetInputDim("Index"); - PADDLE_ENFORCE_EQ( - input_dims.size(), 2, - platform::errors::InvalidArgument( - "Inputs(Index) shape of IndexSample op should be 2-D, but " - "got Index's shape [%s] , please check index shape.", - input_dims)); - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ(input_dims[0], index_dims[0], - platform::errors::InvalidArgument( - "Inputs(X)'s value of dimension 0 must same with " - "Inputs(Index)'s value of dimension 0, but " - "got %d of Inputs(X), and got %d of Inputs(Index), " - "please check Inputs shape.", - input_dims[0], index_dims[0])); - } - ctx->SetOutputDim("Out", index_dims); - auto type = ctx->GetInputsVarType("Index")[0]; - if (type == framework::proto::VarType::LOD_TENSOR) { - ctx->ShareLoD("Index", /*->*/ "Out"); - } - } protected: framework::OpKernelType GetExpectedKernelType( @@ -136,20 +100,11 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(IndexSampleGradNoNeedBufferVarInferer, "X"); } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(index_sample, IndexSampleInferShapeFunctor, + PD_INFER_META(phi::IndexSampleInferMeta)); REGISTER_OPERATOR(index_sample, ops::IndexSampleOp, ops::IndexSampleOpMaker, ops::IndexSampleGradMaker, - ops::IndexSampleGradMaker); + ops::IndexSampleGradMaker, + IndexSampleInferShapeFunctor); REGISTER_OPERATOR(index_sample_grad, ops::IndexSampleGradOp, ops::IndexSampleGradNoNeedBufferVarInferer); -REGISTER_OP_CPU_KERNEL( - index_sample, - ops::IndexSampleKernel, - ops::IndexSampleKernel, - ops::IndexSampleKernel, - ops::IndexSampleKernel); -REGISTER_OP_CPU_KERNEL( - index_sample_grad, - ops::IndexSampleGradKernel, - ops::IndexSampleGradKernel, - ops::IndexSampleGradKernel, - ops::IndexSampleGradKernel); diff --git a/paddle/fluid/operators/index_sample_op.cu b/paddle/fluid/operators/index_sample_op.cu deleted file mode 100644 index e8acbfb8be990a422e5a16e8871d47f55af6620c..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/index_sample_op.cu +++ /dev/null @@ -1,215 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/index_sample_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -#define PREDEFINED_BLOCK_SIZE_X 512 -#define PREDEFINED_BLOCK_SIZE 1024 -#define MIN(a, b) ((a) < (b) ? (a) : (b)) - -namespace paddle { -namespace operators { - -namespace { -void LimitGridDim(const framework::ExecutionContext& ctx, dim3* grid_dim) { - auto max_grid_dim = ctx.template device_context() - .GetCUDAMaxGridDimSize(); - grid_dim->x = grid_dim->x < max_grid_dim[0] ? grid_dim->x : max_grid_dim[0]; - grid_dim->y = grid_dim->y < max_grid_dim[1] ? grid_dim->y : max_grid_dim[1]; -} -} - -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; - -template -__global__ void IndexSampleForward(const IndexT* index, const T* in_data, - T* out_data, size_t index_length, - size_t input_length, size_t batch_size) { - unsigned int index_i = blockDim.x * blockIdx.x + threadIdx.x; - unsigned int index_j = blockDim.y * blockIdx.y + threadIdx.y; - for (; index_j < batch_size; index_j += blockDim.y * gridDim.y) { - index_i = blockDim.x * blockIdx.x + threadIdx.x; - for (; index_i < index_length; index_i += blockDim.x * gridDim.x) { - unsigned int index_idx = index_j * index_length + index_i; - unsigned int in_idx = index_j * input_length + index_i; - IndexT sample_idx = index[index_idx]; - out_data[index_idx] = in_data[in_idx - index_i + sample_idx]; - } - } -} - -template -__global__ void IndexSampleGrad(const IndexT* index, T* in_grad, - const T* out_grad, size_t index_length, - size_t input_length, size_t batch_size, - bool same_data_in_row = true) { - unsigned int index_i = blockDim.x * blockIdx.x + threadIdx.x; - unsigned int index_j = blockDim.y * blockIdx.y + threadIdx.y; - - for (; index_j < batch_size; index_j += blockDim.y * gridDim.y) { - index_i = blockDim.x * blockIdx.x + threadIdx.x; - for (; index_i < index_length; index_i += blockDim.x * gridDim.x) { - unsigned int index_idx = index_j * index_length + index_i; - unsigned int in_idx = index_j * input_length + index_i; - IndexT sample_idx = index[index_idx]; - if (same_data_in_row) { - platform::CudaAtomicAdd(&(in_grad[in_idx - index_i + sample_idx]), - out_grad[sample_idx]); - } else { - in_grad[in_idx - index_i + sample_idx] = out_grad[index_idx]; - } - } - } -} - -template -class IndexSampleKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); - auto* index = ctx.Input("Index"); - auto* output = ctx.Output("Out"); - - const auto& index_type = framework::TransToProtoVarType(index->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT64 || - index_type == framework::proto::VarType::INT32; - PADDLE_ENFORCE_EQ(index_type_match, true, - 
platform::errors::InvalidArgument( - "Input(Index) holds the wrong type, it holds %s, but " - "desires to be %s or %s", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - const auto* in_data = input->data(); - auto* out_data = output->mutable_data(ctx.GetPlace()); - auto stream = - ctx.template device_context().stream(); - - auto input_dim = input->dims(); - auto index_dim = index->dims(); - size_t batch_size = input_dim[0]; - size_t input_length = input_dim[1]; - size_t index_length = index_dim[1]; - - auto block_width = platform::RoundToPowerOfTwo(index_length); - block_width = MIN(block_width, PREDEFINED_BLOCK_SIZE_X); - int block_height = - platform::RoundToPowerOfTwo(index_length * batch_size) / block_width; - block_height = MIN(block_height, PREDEFINED_BLOCK_SIZE / block_width); - dim3 block_dim(block_width, block_height); - dim3 grid_dim((index_length + block_dim.x - 1) / block_dim.x, - (batch_size + block_dim.y - 1) / block_dim.y); - LimitGridDim(ctx, &grid_dim); - - if (index_type == framework::proto::VarType::INT64) { - const int64_t* index_data = index->data(); - IndexSampleForward<<>>( - index_data, in_data, out_data, index_length, input_length, - batch_size); - } else if (index_type == framework::proto::VarType::INT32) { - const int* index_data = index->data(); - IndexSampleForward<<>>( - index_data, in_data, out_data, index_length, input_length, - batch_size); - } - } -}; - -template -class IndexSampleGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* output_grad = ctx.Input(framework::GradVarName("Out")); - auto* input_grad = ctx.Output(framework::GradVarName("X")); - auto* index = ctx.Input("Index"); - - const auto* output_grad_data = output_grad->data(); - auto* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); - - const auto& index_type = framework::TransToProtoVarType(index->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT64 || - index_type == framework::proto::VarType::INT32; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Input(Index) holds the wrong type, it holds %s, but " - "desires to be %s or %s", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - - auto stream = - ctx.template device_context().stream(); - auto input_num = input_grad->numel(); - auto input_dim = input_grad->dims(); - auto index_dim = index->dims(); - size_t batch_size = index_dim[0]; - size_t input_length = input_dim[1]; - size_t index_length = index_dim[1]; - bool same_data_in_index_row = index_length == 1 ? 
false : true; - - auto block_width = platform::RoundToPowerOfTwo(index_length); - block_width = MIN(block_width, PREDEFINED_BLOCK_SIZE_X); - auto block_height = - platform::RoundToPowerOfTwo(index_length * batch_size) / block_width; - block_height = MIN(block_height, PREDEFINED_BLOCK_SIZE / block_width); - dim3 block_dim(block_width, block_height); - dim3 grid_dim((index_length + block_dim.x - 1) / block_dim.x, - (batch_size + block_dim.y - 1) / block_dim.y); - LimitGridDim(ctx, &grid_dim); - - phi::funcs::SetConstant set_zero; - auto& dev_ctx = ctx.template device_context(); - set_zero(dev_ctx, input_grad, static_cast(0)); - - if (index_type == framework::proto::VarType::INT64) { - const int64_t* index_data = index->data(); - IndexSampleGrad<<>>( - index_data, input_grad_data, output_grad_data, index_length, - input_length, batch_size, same_data_in_index_row); - } else if (index_type == framework::proto::VarType::INT32) { - const int* index_data = index->data(); - IndexSampleGrad<<>>( - index_data, input_grad_data, output_grad_data, index_length, - input_length, batch_size, same_data_in_index_row); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - index_sample, - ops::IndexSampleKernel, - ops::IndexSampleKernel, - ops::IndexSampleKernel, - ops::IndexSampleKernel); -REGISTER_OP_CUDA_KERNEL( - index_sample_grad, - ops::IndexSampleGradKernel, - ops::IndexSampleGradKernel, - ops::IndexSampleGradKernel, - ops::IndexSampleGradKernel); diff --git a/paddle/fluid/operators/index_sample_op.h b/paddle/fluid/operators/index_sample_op.h deleted file mode 100644 index 6cc8ff04c544554e805c605783c9bedf1b9fcb7b..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/index_sample_op.h +++ /dev/null @@ -1,198 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include "gflags/gflags.h" -#include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; -using DDim = framework::DDim; - -template -void IndexSampleInner(const framework::ExecutionContext &context, - const LoDTensor &input, const LoDTensor &index, - LoDTensor *output) { - auto input_dims = input.dims(); - auto index_dims = index.dims(); - - int batch_size = input_dims[0]; - auto value_length = input_dims[1]; - auto index_length = index_dims[1]; - int index_ids_num = index.numel(); - - std::vector input_vec; - std::vector index_vec; - paddle::framework::TensorToVector(input, context.device_context(), - &input_vec); - paddle::framework::TensorToVector(index, context.device_context(), - &index_vec); - - std::vector res(index_ids_num); - for (int i = 0; i < index_ids_num; i++) { - int b = floor(i / index_length); - PADDLE_ENFORCE_GE( - index_vec[i], 0, - platform::errors::InvalidArgument( - "Variable value (index) of OP(index_sample) " - "expected >= 0 and < %ld, but got %ld. Please check input " - "value.", - value_length, index_vec[i])); - PADDLE_ENFORCE_LT( - index_vec[i], value_length, - platform::errors::InvalidArgument( - "Variable value (index) of OP(index_sample) " - "expected >= 0 and < %ld, but got %ld. Please check input " - "value.", - value_length, index_vec[i])); - - int v_i = b * value_length + static_cast(index_vec[i]); - T v = input_vec[v_i]; - VLOG(4) << "Index Sample: batch = " << b << " index = " << v_i - << " value = " << v; - res[i] = v; - } - - auto ddim = phi::make_ddim({batch_size, index_length}); - output->mutable_data(context.GetPlace()); - framework::TensorFromVector(res, context.device_context(), output); - output->Resize(ddim); -} - -template -class IndexSampleKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *input_var = ctx.InputVar("X"); - auto *index_var = ctx.InputVar("Index"); - - auto &input_tensor = input_var->Get(); - auto &index_tensor = index_var->Get(); - - auto *out_var = ctx.OutputVar("Out"); - auto *out_tensor = out_var->GetMutable(); - - const auto &index_type = - framework::TransToProtoVarType(index_tensor.dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Input(Index) holds the wrong type, it holds %s, but " - "desires to be %s or %s", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - if (index_type == framework::proto::VarType::INT32) { - IndexSampleInner(ctx, input_tensor, index_tensor, out_tensor); - } else if (index_type == framework::proto::VarType::INT64) { - IndexSampleInner(ctx, input_tensor, index_tensor, out_tensor); - } - } -}; - -template -void IndexSampleGradInner(const framework::ExecutionContext &context, - const LoDTensor &out_grad, const LoDTensor &index, - LoDTensor *x_grad) { - std::vector out_grad_vec; - std::vector index_vec; - paddle::framework::TensorToVector(out_grad, context.device_context(), - &out_grad_vec); - paddle::framework::TensorToVector(index, context.device_context(), - 
&index_vec); - - auto index_dims = index.dims(); - auto x_grad_dims = x_grad->dims(); - - auto value_length = x_grad_dims[1]; - auto index_length = index_dims[1]; - int index_ids_num = index.numel(); - - std::vector x_grad_vec(x_grad->numel(), 0); - - for (int i = 0; i < index_ids_num; i++) { - int b = floor(i / index_length); - PADDLE_ENFORCE_GE( - index_vec[i], 0, - platform::errors::InvalidArgument( - "Variable value (index) of OP(index_sample_grad) " - "expected >= 0 and < %ld, but got %ld. Please check input " - "value.", - value_length, index_vec[i])); - PADDLE_ENFORCE_LT( - index_vec[i], value_length, - platform::errors::InvalidArgument( - "Variable value (index) of OP(index_sample_grad) " - "expected >= 0 and < %ld, but got %ld. Please check input " - "value.", - value_length, index_vec[i])); - int v_i = b * value_length + static_cast(index_vec[i]); - x_grad_vec[v_i] += out_grad_vec[i]; - } - x_grad->mutable_data(context.GetPlace()); - framework::TensorFromVector(x_grad_vec, context.device_context(), x_grad); - x_grad->Resize(x_grad_dims); -} - -template -class IndexSampleGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - auto *index_var = context.InputVar("Index"); - auto *x_grad_var = context.OutputVar(framework::GradVarName("X")); - auto *out_grad_var = context.InputVar(framework::GradVarName("Out")); - - auto &index_tensor = index_var->Get(); - auto &out_grad_tensor = out_grad_var->Get(); - auto *x_grad_tensor = x_grad_var->GetMutable(); - - const auto &index_type = - framework::TransToProtoVarType(index_tensor.dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Input(Index) holds the wrong type, it holds %s, but " - "desires to be %s or %s", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - if (index_type == framework::proto::VarType::INT32) { - IndexSampleGradInner(context, out_grad_tensor, index_tensor, - x_grad_tensor); - } else if (index_type == framework::proto::VarType::INT64) { - IndexSampleGradInner(context, out_grad_tensor, index_tensor, - x_grad_tensor); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/index_sample_op_npu.cc b/paddle/fluid/operators/index_sample_op_npu.cc index f460d0622bccc2e71b1e147c0c9add688c3b11c4..38eb5b4514993412fa3a6c96ccc92e899c57b205 100644 --- a/paddle/fluid/operators/index_sample_op_npu.cc +++ b/paddle/fluid/operators/index_sample_op_npu.cc @@ -12,8 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/index_sample_op.h" - +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/inplace_abn_op.cc b/paddle/fluid/operators/inplace_abn_op.cc index e0779249c41adc5005bbaba6e19127d2ced3a9ec..7f5136969980b887bb7bbe013690898e66abeac1 100644 --- a/paddle/fluid/operators/inplace_abn_op.cc +++ b/paddle/fluid/operators/inplace_abn_op.cc @@ -17,6 +17,8 @@ #include #include #include "paddle/fluid/operators/batch_norm_op.h" +#include "paddle/phi/kernels/batch_norm_grad_kernel.h" +#include "paddle/phi/kernels/batch_norm_kernel.h" namespace paddle { namespace operators { @@ -202,8 +204,7 @@ class InplaceABNOpGradMaker : public framework::SingleGradOpMaker { }; template -class InplaceABNKernel - : public paddle::operators::BatchNormKernel { +class InplaceABNKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* x = ctx.Input("X"); @@ -213,7 +214,33 @@ class InplaceABNKernel auto activation = GetInplaceABNActivationType(ctx.Attr("activation")); auto& place = *ctx.template device_context().eigen_device(); - BatchNormKernel::Compute(ctx); + + auto* scale = ctx.Input("Scale"); + auto* bias = ctx.Input("Bias"); + auto* mean = ctx.Input("Mean"); + auto* variance = ctx.Input("Variance"); + + auto momentum = ctx.Attr("momentum"); + auto epsilon = ctx.Attr("epsilon"); + auto data_layout = ctx.Attr("data_layout"); + auto is_test = ctx.Attr("is_test"); + auto use_global_stats = ctx.Attr("use_global_stats"); + auto trainable_statistics = ctx.Attr("trainable_statistics"); + auto fuse_with_relu = ctx.Attr("fuse_with_relu"); + + auto* mean_out = ctx.Output("MeanOut"); + auto* variance_out = ctx.Output("VarianceOut"); + auto* saved_mean = ctx.Output("SavedMean"); + auto* saved_variance = ctx.Output("SavedVariance"); + auto* reserve_space = ctx.Output("ReserveSpace"); + + auto& dev_ctx = ctx.device_context(); + phi::BatchNormKernel( + static_cast::TYPE&>(dev_ctx), + *x, *scale, *bias, *mean, *variance, momentum, epsilon, data_layout, + is_test, use_global_stats, trainable_statistics, fuse_with_relu, y, + mean_out, variance_out, saved_mean, saved_variance, reserve_space); auto cur_y = EigenVector::Flatten(*y); InplaceABNActivation functor; @@ -222,8 +249,7 @@ class InplaceABNKernel }; template -class InplaceABNGradKernel - : public paddle::operators::BatchNormGradKernel { +class InplaceABNGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* y = ctx.Input("Y"); @@ -244,7 +270,52 @@ class InplaceABNGradKernel InplaceABNActivation functor; functor.GradCompute(ctx, activation, place, cur_y, cur_y, cur_dy, cur_dy); - BatchNormGradKernel::Compute(ctx); + // BatchNormGradKernel::Compute(ctx); + + auto* scale = ctx.Input("Scale"); + auto* bias = ctx.Input("Bias"); + auto* saved_mean = ctx.Input("SavedMean"); + auto* saved_variance = ctx.Input("SavedVariance"); + + auto momentum = ctx.Attr("momentum"); + auto epsilon = ctx.Attr("epsilon"); + auto data_layout = ctx.Attr("data_layout"); + auto is_test = ctx.Attr("is_test"); + auto use_global_stats = ctx.Attr("use_global_stats"); + auto trainable_statistics = ctx.Attr("trainable_statistics"); + auto fuse_with_relu = ctx.Attr("fuse_with_relu"); + + auto* scale_grad = ctx.Output(framework::GradVarName("Scale")); + auto* bias_grad = ctx.Output(framework::GradVarName("Bias")); + + auto* reserve_space = 
ctx.Input("ReserveSpace"); + auto* mean = ctx.Input("ReserveSpace"); + auto* variance = ctx.Input("ReserveSpace"); + + paddle::optional space_opt = paddle::none; + paddle::optional mean_opt = paddle::none; + paddle::optional variance_opt = paddle::none; + + if (reserve_space != nullptr) { + space_opt = *reserve_space; + } + + if (mean != nullptr) { + mean_opt = *mean; + } + + if (variance != nullptr) { + variance_opt = *variance; + } + + auto& dev_ctx = ctx.device_context(); + phi::BatchNormGradRawKernel( + static_cast::TYPE&>(dev_ctx), + *d_y, *y, *scale, *bias, *saved_mean, *saved_variance, space_opt, + mean_opt, variance_opt, momentum, epsilon, data_layout, is_test, + use_global_stats, trainable_statistics, fuse_with_relu, true, d_x, + scale_grad, bias_grad); } }; diff --git a/paddle/fluid/operators/inplace_abn_op.cu b/paddle/fluid/operators/inplace_abn_op.cu index be7a7bd71711e379ef4d98eb1f9ac5ee2caaace6..db8f8c72d13f8e46f6f9e332c5c2f5164b6d0836 100644 --- a/paddle/fluid/operators/inplace_abn_op.cu +++ b/paddle/fluid/operators/inplace_abn_op.cu @@ -15,14 +15,15 @@ limitations under the License. */ #include "paddle/fluid/operators/batch_norm_op.h" #include "paddle/fluid/operators/inplace_abn_op.h" #include "paddle/fluid/operators/sync_batch_norm_op.cu.h" +#include "paddle/phi/kernels/batch_norm_grad_kernel.h" +#include "paddle/phi/kernels/batch_norm_kernel.h" namespace paddle { namespace operators { template class InplaceABNKernel - : public paddle::operators::SyncBatchNormKernel, - public paddle::operators::BatchNormKernel { + : public paddle::operators::SyncBatchNormKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* y = ctx.Output("Y"); @@ -36,7 +37,33 @@ class InplaceABNKernel if (ctx.Attr("use_sync_bn")) { SyncBatchNormKernel::Compute(ctx); } else { - BatchNormKernel::Compute(ctx); + // BatchNormKernel::Compute(ctx); + auto* scale = ctx.Input("Scale"); + auto* bias = ctx.Input("Bias"); + auto* mean = ctx.Input("Mean"); + auto* variance = ctx.Input("Variance"); + + auto momentum = ctx.Attr("momentum"); + auto epsilon = ctx.Attr("epsilon"); + auto data_layout = ctx.Attr("data_layout"); + auto is_test = ctx.Attr("is_test"); + auto use_global_stats = ctx.Attr("use_global_stats"); + auto trainable_statistics = ctx.Attr("trainable_statistics"); + auto fuse_with_relu = ctx.Attr("fuse_with_relu"); + + auto* mean_out = ctx.Output("MeanOut"); + auto* variance_out = ctx.Output("VarianceOut"); + auto* saved_mean = ctx.Output("SavedMean"); + auto* saved_variance = ctx.Output("SavedVariance"); + auto* reserve_space = ctx.Output("ReserveSpace"); + + auto& dev_ctx = ctx.device_context(); + phi::BatchNormKernel( + static_cast::TYPE&>(dev_ctx), + *x, *scale, *bias, *mean, *variance, momentum, epsilon, data_layout, + is_test, use_global_stats, trainable_statistics, fuse_with_relu, y, + mean_out, variance_out, saved_mean, saved_variance, reserve_space); } auto cur_y = EigenVector::Flatten(*y); @@ -49,8 +76,7 @@ class InplaceABNKernel // https://kevinzakka.github.io/2016/09/14/batch_normalization/ template class InplaceABNGradKernel - : public paddle::operators::SyncBatchNormGradKernel, - public paddle::operators::BatchNormGradKernel { + : public paddle::operators::SyncBatchNormGradKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { const auto* y = ctx.Input("Y"); @@ -74,7 +100,50 @@ class InplaceABNGradKernel if (ctx.Attr("use_sync_bn")) { SyncBatchNormGradKernel::Compute(ctx); } else { - BatchNormGradKernel::Compute(ctx); + 
auto* scale = ctx.Input("Scale"); + auto* bias = ctx.Input("Bias"); + auto* saved_mean = ctx.Input("SavedMean"); + auto* saved_variance = ctx.Input("SavedVariance"); + + auto momentum = ctx.Attr("momentum"); + auto epsilon = ctx.Attr("epsilon"); + auto data_layout = ctx.Attr("data_layout"); + auto is_test = ctx.Attr("is_test"); + auto use_global_stats = ctx.Attr("use_global_stats"); + auto trainable_statistics = ctx.Attr("trainable_statistics"); + auto fuse_with_relu = ctx.Attr("fuse_with_relu"); + + auto* scale_grad = ctx.Output(framework::GradVarName("Scale")); + auto* bias_grad = ctx.Output(framework::GradVarName("Bias")); + + auto* reserve_space = ctx.Input("ReserveSpace"); + auto* mean = ctx.Input("ReserveSpace"); + auto* variance = ctx.Input("ReserveSpace"); + + paddle::optional space_opt = paddle::none; + paddle::optional mean_opt = paddle::none; + paddle::optional variance_opt = paddle::none; + + if (reserve_space != nullptr) { + space_opt = *reserve_space; + } + + if (mean != nullptr) { + mean_opt = *mean; + } + + if (variance != nullptr) { + variance_opt = *variance; + } + + auto& dev_ctx = ctx.device_context(); + phi::BatchNormGradRawKernel( + static_cast::TYPE&>(dev_ctx), + *d_y, *y, *scale, *bias, *saved_mean, *saved_variance, space_opt, + mean_opt, variance_opt, momentum, epsilon, data_layout, is_test, + use_global_stats, trainable_statistics, fuse_with_relu, true, d_x, + scale_grad, bias_grad); } } }; diff --git a/paddle/fluid/operators/inverse_op.h b/paddle/fluid/operators/inverse_op.h index 1e061d8b50ae02f9b87f0a0976543467aa0b7dd0..31c22915ec5d052eb11c613d476f6aea541d8c47 100644 --- a/paddle/fluid/operators/inverse_op.h +++ b/paddle/fluid/operators/inverse_op.h @@ -15,8 +15,8 @@ limitations under the License. */ #pragma once #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/matrix_inverse.h" #include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/matrix_inverse.h" namespace paddle { namespace operators { @@ -30,7 +30,7 @@ class InverseKernel : public framework::OpKernel { output->mutable_data(context.GetPlace()); auto& dev_ctx = context.template device_context(); - math::MatrixInverseFunctor mat_inv; + phi::funcs::MatrixInverseFunctor mat_inv; mat_inv(dev_ctx, *input, output); } }; diff --git a/paddle/fluid/operators/is_empty_op.cc b/paddle/fluid/operators/is_empty_op.cc index 2750367dc773925e998507db4690e39c15f985d0..c835bb3cf60bfbf71b585828c74ac45f6bc91f8b 100644 --- a/paddle/fluid/operators/is_empty_op.cc +++ b/paddle/fluid/operators/is_empty_op.cc @@ -12,9 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/is_empty_op.h" +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -24,12 +26,6 @@ class IsEmptyOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; protected: - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "IsEmpty"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "IsEmpty"); - ctx->SetOutputDim("Out", {1}); - } - framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { auto *x = ctx.Input("X"); @@ -56,12 +52,10 @@ It will just return product(tensor.ddims()) > 0; } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(is_empty, IsEmptyInferShapeFunctor, + PD_INFER_META(phi::IsEmptyInferMeta)); REGISTER_OPERATOR( is_empty, ops::IsEmptyOp, ops::IsEmptyOpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL( - is_empty, ops::IsEmptyOpKernel, - ops::IsEmptyOpKernel, - ops::IsEmptyOpKernel, - ops::IsEmptyOpKernel); + paddle::framework::EmptyGradOpMaker, + IsEmptyInferShapeFunctor); diff --git a/paddle/fluid/operators/is_empty_op.cu.cc b/paddle/fluid/operators/is_empty_op.cu.cc deleted file mode 100644 index 3c256503baf6ba3bc8f8dff866a2ce9c57ec5bf1..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/is_empty_op.cu.cc +++ /dev/null @@ -1,23 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/is_empty_op.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - is_empty, ops::IsEmptyOpKernel, - ops::IsEmptyOpKernel, - ops::IsEmptyOpKernel, - ops::IsEmptyOpKernel); diff --git a/paddle/fluid/operators/isfinite_v2_op.cc b/paddle/fluid/operators/isfinite_v2_op.cc index 735fffa7203b1213fccec0c4098048e85a6b24f8..cfa370ff9cb19dfb7d488b03cba52c115083cdc8 100644 --- a/paddle/fluid/operators/isfinite_v2_op.cc +++ b/paddle/fluid/operators/isfinite_v2_op.cc @@ -12,11 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/operators/isfinite_v2_op.h" - #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/common_infer_shape_functions.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace framework { @@ -49,11 +51,6 @@ class OverflowV2Op : public framework::OperatorWithKernel { const framework::VariableNameMap &outputs, const framework::AttributeMap &attrs) : OperatorWithKernel(type, inputs, outputs, attrs) {} - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "isfinitev2"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "isfinitev2"); - UnaryOpUnchangedInferShape(ctx); - } protected: framework::OpKernelType GetExpectedKernelType( @@ -104,6 +101,14 @@ element of X as a tensor. } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(isinf_v2, IsinfInferShapeFunctor, + PD_INFER_META(phi::IsfiniteInferMeta)); + +DECLARE_INFER_SHAPE_FUNCTOR(isnan_v2, IsnanInferShapeFunctor, + PD_INFER_META(phi::IsfiniteInferMeta)); + +DECLARE_INFER_SHAPE_FUNCTOR(isfinite_v2, IsfiniteInferShapeFunctor, + PD_INFER_META(phi::IsfiniteInferMeta)); #define REGISTER_V2OP_MAKER(op_type, comment) \ namespace paddle { \ @@ -124,50 +129,17 @@ REGISTER_V2OP_MAKER(isfinite_v2, "isfinitev2(X)"); REGISTER_OPERATOR( isinf_v2, ops::OverflowV2Op, ops::_isinf_v2OverflowV2OpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); + paddle::framework::EmptyGradOpMaker, + IsinfInferShapeFunctor); REGISTER_OPERATOR( isnan_v2, ops::OverflowV2Op, ops::_isnan_v2OverflowV2OpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); + paddle::framework::EmptyGradOpMaker, + IsnanInferShapeFunctor); REGISTER_OPERATOR( isfinite_v2, ops::OverflowV2Op, ops::_isfinite_v2OverflowV2OpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); - -REGISTER_OP_CPU_KERNEL(isnan_v2, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel); - -REGISTER_OP_CPU_KERNEL( - isinf_v2, ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel); - -REGISTER_OP_CPU_KERNEL( - isfinite_v2, ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel); + paddle::framework::EmptyGradOpMaker, + IsfiniteInferShapeFunctor); diff --git a/paddle/fluid/operators/isfinite_v2_op.cu b/paddle/fluid/operators/isfinite_v2_op.cu deleted file mode 100644 index 1b9f19d36dfa0f590f96577295ffb12e4456d2e5..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/isfinite_v2_op.cu +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/operators/isfinite_v2_op.h" -#include "paddle/fluid/platform/float16.h" - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL(isnan_v2, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel); - -REGISTER_OP_CUDA_KERNEL( - isinf_v2, ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel); - -REGISTER_OP_CUDA_KERNEL( - isfinite_v2, ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel); diff --git a/paddle/fluid/operators/kthvalue_op.cu b/paddle/fluid/operators/kthvalue_op.cu index 4f30c58d375008abb3509989f90bcd9fec91fb38..f6f56f70f1a11971b31e679ef879f2d1d0a96085 100644 --- a/paddle/fluid/operators/kthvalue_op.cu +++ b/paddle/fluid/operators/kthvalue_op.cu @@ -16,7 +16,6 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/kthvalue_op.h" #include "paddle/fluid/operators/top_k_function_cuda.h" -#include "paddle/fluid/operators/top_k_v2_op.h" #ifdef __NVCC__ #include "cub/cub.cuh" #endif diff --git a/paddle/fluid/operators/layer_norm_kernel.cu.h b/paddle/fluid/operators/layer_norm_kernel.cu.h index b31c7a1cde0f18edb00435805ce4b2a089f9eb1a..412ae3c49b5f3cc9fc2422aa220af324e6d99b69 100644 --- a/paddle/fluid/operators/layer_norm_kernel.cu.h +++ b/paddle/fluid/operators/layer_norm_kernel.cu.h @@ -22,10 +22,10 @@ limitations under the License. */ namespace cub = hipcub; #endif -#include "paddle/fluid/platform/aligned_vector.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/phi/core/ddim.h" +#include "paddle/phi/kernels/funcs/aligned_vector.h" namespace paddle { namespace operators { @@ -186,8 +186,8 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void ln_fwd_1024_kernel( const ScaleT *__restrict__ gamma_ptr, const ScaleT *__restrict__ beta_ptr, U *__restrict__ mean_out_ptr, U *__restrict__ var_out_ptr, T *__restrict__ y_ptr) { - using Vec = platform::AlignedVector; - using Vec_scale = platform::AlignedVector; + using Vec = phi::AlignedVector; + using Vec_scale = phi::AlignedVector; const int tidx = threadIdx.x; const int bidx = blockIdx.x; @@ -203,8 +203,8 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void ln_fwd_1024_kernel( Vec_scale beta[LDGS]; #pragma unroll for (int it = 0, col = c; it < LDGS; it++) { - platform::Load(gamma_ptr + col * VecSize, &gamma[it]); - platform::Load(beta_ptr + col * VecSize, &beta[it]); + phi::Load(gamma_ptr + col * VecSize, &gamma[it]); + phi::Load(beta_ptr + col * VecSize, &beta[it]); col += THREADS_PER_ROW; } @@ -213,8 +213,7 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void ln_fwd_1024_kernel( Vec x[LDGS]; #pragma unroll for (int it = 0, col = c; it < LDGS; it++) { - platform::Load(x_ptr + row * LN_NUM_COLS + col * VecSize, - &x[it]); + phi::Load(x_ptr + row * LN_NUM_COLS + col * VecSize, &x[it]); col += THREADS_PER_ROW; } U xf[LDGS * VecSize]; @@ -276,8 +275,7 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void ln_fwd_1024_kernel( #pragma unroll for (int it = 0, col = c; it < LDGS; it++) { - platform::Store(x[it], - y_ptr + row * LN_NUM_COLS + col * VecSize); + phi::Store(x[it], y_ptr + row * LN_NUM_COLS + col * VecSize); col += THREADS_PER_ROW; } } @@ -401,9 +399,9 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_bwd_1024_kernel( U *__restrict__ dgamma_temp_ptr, U *__restrict__ 
dbeta_temp_ptr, T *__restrict__ dx_ptr, const MaskType *mask_ptr = nullptr, T factor = static_cast(0), T *d_dropout_src_ptr = nullptr) { - using Vec = platform::AlignedVector; - using Vec_scale = platform::AlignedVector; - using MaskLoadT = platform::AlignedVector; + using Vec = phi::AlignedVector; + using Vec_scale = phi::AlignedVector; + using MaskLoadT = phi::AlignedVector; const int tidx = threadIdx.x; const int bidx = blockIdx.x; @@ -439,7 +437,7 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_bwd_1024_kernel( int col = c; #pragma unroll for (int it = 0; it < LDGS; it++) { - platform::Load(gamma_ptr + col * VecSize, &gamma[it]); + phi::Load(gamma_ptr + col * VecSize, &gamma[it]); col += THREADS_PER_ROW; } @@ -452,12 +450,11 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_bwd_1024_kernel( int col = c; #pragma unroll for (int it = 0; it < LDGS; it++) { - platform::Load(dout_ptr + row * LN_NUM_COLS + col * VecSize, - &dout[it]); - platform::Load(x_ptr + row * LN_NUM_COLS + col * VecSize, - &x[it]); + phi::Load(dout_ptr + row * LN_NUM_COLS + col * VecSize, + &dout[it]); + phi::Load(x_ptr + row * LN_NUM_COLS + col * VecSize, &x[it]); if (isFusedDropoutResidualLn) { - platform::Load( + phi::Load( mask_ptr + row * LN_NUM_COLS + col * VecSize, &mask_vec[it]); } @@ -474,11 +471,11 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_bwd_1024_kernel( for (int it = 0; it < LDGS; it++) { #pragma unroll for (int jt = 0; jt < VecSize; jt++) { - U x_tmp = x[it][jt]; + U x_tmp = static_cast(x[it][jt]); U y_tmp = var_cur_row * (x_tmp - mean_cur_row); U dy_tmp = static_cast(gamma[it][jt]) * - static_cast(dout[it][jt]); // scale * dy - U dout_tmp = dout[it][jt]; // dy + static_cast(dout[it][jt]); // scale * dy + U dout_tmp = static_cast(dout[it][jt]); // dy // used for get dx (row reduction) sum_loss1 += dy_tmp; // scale * dy, sum_1 @@ -552,10 +549,9 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_bwd_1024_kernel( col = c; #pragma unroll for (int it = 0; it < LDGS; it++) { - platform::Store(x[it], - dx_ptr + row * LN_NUM_COLS + col * VecSize); + phi::Store(x[it], dx_ptr + row * LN_NUM_COLS + col * VecSize); if (isFusedDropoutResidualLn) { - platform::Store( + phi::Store( dout[it], d_dropout_src_ptr + row * LN_NUM_COLS + col * VecSize); } col += THREADS_PER_ROW; @@ -641,7 +637,7 @@ template < __global__ __launch_bounds__(THREADS_PER_CTA) void ln_bwd_1024_final_kernel( const int rows, U *__restrict__ dg_part_, U *__restrict__ db_part_, ScaleT *__restrict__ dg_, ScaleT *__restrict__ db_) { - using Vec = platform::AlignedVector; + using Vec = phi::AlignedVector; static_assert(VEC_COLS == LN_NUM_COLS / VecSize, ""); const int tidx = threadIdx.x; @@ -669,8 +665,8 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void ln_bwd_1024_final_kernel( for (int row = r; row < rows; row += ROWS_PER_CTA) { Vec dg; Vec db; - platform::Load(dg_part_ptr, &dg); - platform::Load(db_part_ptr, &db); + phi::Load(dg_part_ptr, &dg); + phi::Load(db_part_ptr, &db); dg_part_ptr += ROWS_PER_CTA * LN_NUM_COLS; db_part_ptr += ROWS_PER_CTA * LN_NUM_COLS; diff --git a/paddle/fluid/operators/layer_norm_op.cu b/paddle/fluid/operators/layer_norm_op.cu index d439b3220d96ecd1107d6c29850d3d5356a01e09..dfe73d3727132ae9b8f71e2a415ef5193f303493 100644 --- a/paddle/fluid/operators/layer_norm_op.cu +++ b/paddle/fluid/operators/layer_norm_op.cu @@ -259,6 +259,21 @@ REGISTER_OP_CUDA_KERNEL( ops::LayerNormGradKernel, ops::LayerNormGradKernel); +#elif CUDNN_VERSION_MIN(8, 1, 0) 
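The layer_norm changes above only re-point the vectorized helpers from platform::AlignedVector to phi::AlignedVector; the underlying Load/Store trick is unchanged. A minimal host-side sketch of that pattern follows, assuming hypothetical names (VecType, Load, Store) rather than the phi definitions.

#include <cstddef>
#include <cstdio>

// Packing VecSize elements into an over-aligned aggregate lets the compiler
// (or nvcc) emit one wide load/store instead of VecSize scalar accesses.
template <typename T, int VecSize>
struct alignas(sizeof(T) * VecSize) VecType {
  T val[VecSize];
  T& operator[](int i) { return val[i]; }
  const T& operator[](int i) const { return val[i]; }
};

template <typename T, int VecSize>
void Load(const T* addr, VecType<T, VecSize>* vec) {
  *vec = *reinterpret_cast<const VecType<T, VecSize>*>(addr);
}

template <typename T, int VecSize>
void Store(const VecType<T, VecSize>& vec, T* addr) {
  *reinterpret_cast<VecType<T, VecSize>*>(addr) = vec;
}

int main() {
  alignas(16) float buf[8] = {0, 1, 2, 3, 4, 5, 6, 7};
  VecType<float, 4> v;
  Load<float, 4>(buf, &v);      // one 16-byte load of buf[0..3]
  Store<float, 4>(v, buf + 4);  // one 16-byte store into buf[4..7]
  std::printf("%g %g\n", buf[4], buf[7]);  // prints "0 3"
  return 0;
}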
+REGISTER_OP_CUDA_KERNEL( + layer_norm, + ops::LayerNormKernel, + ops::LayerNormKernel, + ops::LayerNormKernel, + ops::LayerNormKernel); +REGISTER_OP_CUDA_KERNEL( + layer_norm_grad, + ops::LayerNormGradKernel, + ops::LayerNormGradKernel, + ops::LayerNormGradKernel, + ops::LayerNormGradKernel); #else REGISTER_OP_CUDA_KERNEL( layer_norm, diff --git a/paddle/fluid/operators/lerp_op.cc b/paddle/fluid/operators/lerp_op.cc index 0aaefc7ca75eb0f98e35200f0a1940aae07315b2..5e053445379118b37c9b0e0bdcb01adaec65b6c1 100644 --- a/paddle/fluid/operators/lerp_op.cc +++ b/paddle/fluid/operators/lerp_op.cc @@ -12,7 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/ternary.h" namespace paddle { namespace operators { @@ -20,49 +23,6 @@ namespace operators { class LerpOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "lerp"); - OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "lerp"); - OP_INOUT_CHECK(ctx->HasInput("Weight"), "Input", "Weight", "lerp"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "lerp"); - - auto x_dims = ctx->GetInputDim("X"); - auto y_dims = ctx->GetInputDim("Y"); - auto w_dims = ctx->GetInputDim("Weight"); - framework::DDim out_dims; - out_dims = GetOutputDims(x_dims, y_dims); - if (w_dims.size() > 1 || w_dims[0] != 1) { - out_dims = GetOutputDims(out_dims, w_dims); - } - - ctx->SetOutputDim("Out", out_dims); - ctx->ShareLoD("X", /*->*/ "Out"); - } - - private: - framework::DDim GetOutputDims(const framework::DDim& s_dims, - const framework::DDim& l_dims) const { - if (s_dims.size() > l_dims.size()) { - return GetOutputDims(l_dims, s_dims); - } - std::vector shapes = phi::vectorize(l_dims); - for (int i = s_dims.size() - 1, j = l_dims.size() - 1; i >= 0; --i, --j) { - int64_t s = s_dims[i]; - int64_t l = l_dims[j]; - if (s != l) { - if (l == 1) { - shapes[j] = s; - } else if (s != 1) { - PADDLE_THROW(platform::errors::InvalidArgument( - "The shape of tensor a %s:%d must match shape of tensor b " - "%s:%d.", - s_dims.to_str(), i, l_dims.to_str(), j)); - } - } - } - return phi::make_ddim(shapes); - } }; class LerpOpMaker : public framework::OpProtoAndCheckerMaker { @@ -125,10 +85,12 @@ DECLARE_INPLACE_OP_INFERER(LerpInplaceInferer, {"X", "Out"}); } // namespace operators } // namespace paddle +DECLARE_INFER_SHAPE_FUNCTOR(lerp, LerpInferShapeFunctor, + PD_INFER_META(phi::LerpInferMeta)); REGISTER_OPERATOR( lerp, paddle::operators::LerpOp, paddle::operators::LerpOpMaker, paddle::operators::LerpOpGradMaker, paddle::operators::LerpOpGradMaker, - paddle::operators::LerpInplaceInferer); + paddle::operators::LerpInplaceInferer, LerpInferShapeFunctor); REGISTER_OPERATOR(lerp_grad, paddle::operators::LerpGradOp); diff --git a/paddle/fluid/operators/linspace_op.cc b/paddle/fluid/operators/linspace_op.cc index fe271fa5e893a750bdbbdc05ac4b7835205ebe66..378c7573d6129abc28bd53dd6f964e5c726cce34 100644 --- a/paddle/fluid/operators/linspace_op.cc +++ b/paddle/fluid/operators/linspace_op.cc @@ -12,9 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
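The lerp change above deletes LerpOp::GetOutputDims and relies on phi::LerpInferMeta instead. For reference, the deleted code implemented the usual right-aligned broadcast rule (trailing dimensions must match or one of them must be 1); a standalone sketch with a hypothetical BroadcastDims helper:

#include <cstdint>
#include <stdexcept>
#include <vector>

// Right-aligned broadcasting, as in the removed GetOutputDims: walk the two
// shapes from the back, keep equal dims, expand a 1, otherwise fail.
std::vector<int64_t> BroadcastDims(std::vector<int64_t> a,
                                   std::vector<int64_t> b) {
  if (a.size() > b.size()) a.swap(b);  // make `b` the longer shape
  std::vector<int64_t> out = b;
  for (int i = static_cast<int>(a.size()) - 1,
           j = static_cast<int>(b.size()) - 1;
       i >= 0; --i, --j) {
    if (a[i] == b[j]) continue;
    if (b[j] == 1) {
      out[j] = a[i];
    } else if (a[i] != 1) {
      throw std::invalid_argument("shapes are not broadcast-compatible");
    }
  }
  return out;
}

int main() {
  // lerp(X, Y, Weight) computes Out = X + Weight * (Y - X) with all three
  // inputs broadcast to a common shape.
  auto xy = BroadcastDims({4, 1, 3}, {2, 3});  // -> {4, 2, 3}
  auto out = BroadcastDims(xy, {1});           // scalar-like weight
  return out == std::vector<int64_t>({4, 2, 3}) ? 0 : 1;
}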
See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/linspace_op.h" #include + +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/ternary.h" namespace paddle { namespace operators { @@ -23,33 +27,6 @@ class LinspaceOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Start"), "Input", "Start", "linspace"); - OP_INOUT_CHECK(ctx->HasInput("Stop"), "Input", "Stop", "linspace"); - OP_INOUT_CHECK(ctx->HasInput("Num"), "Input", "Num", "linspace"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "linspace"); - - auto s_dims = ctx->GetInputDim("Start"); - PADDLE_ENFORCE_EQ((s_dims.size() == 1) && (s_dims[0] == 1), true, - platform::errors::InvalidArgument( - "The shape of Input(Start) must be [1]," - "but received input shape is [%s].", - s_dims)); - auto e_dims = ctx->GetInputDim("Stop"); - PADDLE_ENFORCE_EQ((e_dims.size() == 1) && (e_dims[0] == 1), true, - platform::errors::InvalidArgument( - "The shape of Input(Stop) must be [1]," - "but received input shape is [%s].", - e_dims)); - auto step_dims = ctx->GetInputDim("Num"); - PADDLE_ENFORCE_EQ( - (step_dims.size() == 1) && (step_dims[0] == 1), true, - platform::errors::InvalidArgument("The shape of Input(Num) must be [1]," - "but received input shape is [%s].", - step_dims)); - ctx->SetOutputDim("Out", {-1}); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -88,11 +65,13 @@ class LinspaceOpMaker : public framework::OpProtoAndCheckerMaker { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_WITHOUT_GRADIENT(linspace, ops::LinspaceOp, ops::LinspaceOpMaker); -REGISTER_OP_CPU_KERNEL(linspace, ops::CPULinspaceKernel, - ops::CPULinspaceKernel, - ops::CPULinspaceKernel, - ops::CPULinspaceKernel); +DECLARE_INFER_SHAPE_FUNCTOR(linspace, LinspaceInferShapeFunctor, + PD_INFER_META(phi::LinspaceInferMeta)); +REGISTER_OPERATOR( + linspace, ops::LinspaceOp, ops::LinspaceOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker, + LinspaceInferShapeFunctor); REGISTER_OP_VERSION(linspace) .AddCheckpoint( diff --git a/paddle/fluid/operators/linspace_op.cu b/paddle/fluid/operators/linspace_op.cu deleted file mode 100644 index aa625a7f5b9df0aa76872c56a3769f1186125bf5..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/linspace_op.cu +++ /dev/null @@ -1,104 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/framework/data_type_transform.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/linspace_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -__global__ void LinspaceKernel(T start, T stop, double step, int64_t size, - T* out) { - int64_t index = blockIdx.x * blockDim.x + threadIdx.x; - - for (; index < size; index += blockDim.x * gridDim.x) { - if (index < size / 2) { - out[index] = static_cast(start + step * index); - } else { - out[index] = static_cast(stop - step * (size - index - 1)); - } - } -} - -template -__global__ void LinspaceSpecialKernel(T start, T* out) { - out[0] = static_cast(start); -} - -template -class CUDALinspaceKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* pre_start = context.Input("Start"); - auto* pre_stop = context.Input("Stop"); - auto* num_t = context.Input("Num"); - auto* out = context.Output("Out"); - auto dtype = static_cast( - context.Attr("dtype")); - - Tensor start_t; - Tensor stop_t; - auto start_dtype = framework::OpKernelType( - framework::TransToProtoVarType(pre_start->dtype()), context.GetPlace()); - auto stop_dtype = framework::OpKernelType( - framework::TransToProtoVarType(pre_stop->dtype()), context.GetPlace()); - auto out_dtype = framework::OpKernelType(dtype, context.GetPlace()); - framework::TransDataType(start_dtype, out_dtype, *pre_start, &start_t); - framework::TransDataType(stop_dtype, out_dtype, *pre_stop, &stop_t); - - framework::Tensor n_start; - framework::Tensor n_stop; - framework::Tensor n_num; - framework::TensorCopy(start_t, platform::CPUPlace(), &n_start); - T start = n_start.data()[0]; - framework::TensorCopy(stop_t, platform::CPUPlace(), &n_stop); - T stop = n_stop.data()[0]; - framework::TensorCopy(*num_t, platform::CPUPlace(), &n_num); - int64_t num = static_cast(n_num.data()[0]); - - PADDLE_ENFORCE_GT(num, 0, platform::errors::InvalidArgument( - "The num of linspace op should be larger " - "than 0, but received num is %d", - num)); - - out->Resize(phi::make_ddim({num})); - T* out_data = out->mutable_data(context.GetPlace()); - - double step = 0; - auto stream = context.cuda_device_context().stream(); - int block = 512; - int grid = (num + block - 1) / block; - if (num != 1) { - step = (static_cast(stop - start)) / (num - 1); - LinspaceKernel<<>>(start, stop, step, num, - out_data); - } else { - LinspaceSpecialKernel<<>>(start, out_data); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(linspace, ops::CUDALinspaceKernel, - ops::CUDALinspaceKernel, - ops::CUDALinspaceKernel, - ops::CUDALinspaceKernel); diff --git a/paddle/fluid/operators/linspace_op.h b/paddle/fluid/operators/linspace_op.h deleted file mode 100644 index ae51f1221cc09b433e784ecaf52da69e41fc3706..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/linspace_op.h +++ /dev/null @@ -1,76 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include "paddle/fluid/framework/data_type_transform.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class CPULinspaceKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* pre_start = context.Input("Start"); - auto* pre_stop = context.Input("Stop"); - int32_t num = context.Input("Num")->data()[0]; - auto* out = context.Output("Out"); - auto dtype = static_cast( - context.Attr("dtype")); - - Tensor start_t; - Tensor stop_t; - auto start_dtype = framework::OpKernelType( - framework::TransToProtoVarType(pre_start->dtype()), context.GetPlace()); - auto stop_dtype = framework::OpKernelType( - framework::TransToProtoVarType(pre_stop->dtype()), context.GetPlace()); - auto out_dtype = framework::OpKernelType(dtype, context.GetPlace()); - framework::TransDataType(start_dtype, out_dtype, *pre_start, &start_t); - framework::TransDataType(stop_dtype, out_dtype, *pre_stop, &stop_t); - - T start = start_t.data()[0]; - T stop = stop_t.data()[0]; - PADDLE_ENFORCE_GT(num, 0, platform::errors::InvalidArgument( - "The num of linspace op should be larger " - "than 0, but received num is %d", - num)); - - out->Resize(phi::make_ddim({num})); - - T* out_data = out->mutable_data(context.GetPlace()); - - if (num > 1) { - // step should be of double type for all types - double step = (static_cast(stop - start)) / (num - 1); - int half_num = num / 2; - for (int i = 0; i < num; ++i) { - if (i < half_num) { - out_data[i] = static_cast(start + step * i); - } else { - out_data[i] = static_cast(stop - step * (num - i - 1)); - } - } - } else { - out_data[0] = static_cast(start); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/log_loss_op.cc b/paddle/fluid/operators/log_loss_op.cc index df4d0ebbccd5e3fb4dd6131fb5fbcaa9056bd9d6..883e3597d8a31138a6ff1e4cfcb05a165eafc4a6 100644 --- a/paddle/fluid/operators/log_loss_op.cc +++ b/paddle/fluid/operators/log_loss_op.cc @@ -12,8 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
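The deleted linspace kernels above (CUDA and CPU) both fill the output from the two ends toward the middle so that the last element is exactly `stop` despite rounding, and accumulate the step in double regardless of the output type. A standalone sketch of that computation, using a hypothetical Linspace helper rather than the phi kernel that replaces them:

#include <cstdio>
#include <vector>

std::vector<float> Linspace(float start, float stop, int num) {
  std::vector<float> out(num);
  if (num == 1) {
    out[0] = start;  // special case, as in the removed LinspaceSpecialKernel
    return out;
  }
  // Step is computed in double for all element types.
  double step = static_cast<double>(stop - start) / (num - 1);
  for (int i = 0; i < num; ++i) {
    out[i] = (i < num / 2)
                 ? static_cast<float>(start + step * i)              // from the front
                 : static_cast<float>(stop - step * (num - 1 - i));  // from the back
  }
  return out;
}

int main() {
  auto v = Linspace(0.f, 1.f, 5);  // 0, 0.25, 0.5, 0.75, 1
  for (float x : v) std::printf("%g ", x);
  std::printf("\n");
  return 0;
}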
*/ -#include "paddle/fluid/operators/log_loss_op.h" #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -21,43 +24,6 @@ namespace operators { class LogLossOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Predicted"), "Input", "Predicted", "LogLoss"); - OP_INOUT_CHECK(ctx->HasInput("Labels"), "Input", "Labels", "LogLoss"); - - auto pred_dims = ctx->GetInputDim("Predicted"); - auto label_dims = ctx->GetInputDim("Labels"); - - if (ctx->IsRuntime() || - (phi::product(pred_dims) > 0 && phi::product(label_dims) > 0)) { - PADDLE_ENFORCE_EQ( - pred_dims, label_dims, - platform::errors::InvalidArgument( - "The dimensions of Input(Predicted) must be equal to the" - "dimensions of Input(Labels), but received dimensions of " - "Input(Predicted)" - "is [%s], received dimensions of Input(Labels) is [%s].", - pred_dims, label_dims)); - } - PADDLE_ENFORCE_EQ(pred_dims.size(), 2, - platform::errors::InvalidArgument( - "The dimensions of Input(Predicted) must be 2," - "But received dimensions of Input(Predicted)" - "is [%d]", - pred_dims.size())); - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ( - pred_dims[1], 1, - platform::errors::InvalidArgument( - "Each row of Input(Predicted) contains a real value, " - "so the 2nd dimension of Input(X) must be 1," - "But got [%d]", - pred_dims[1])); - } - ctx->SetOutputDim("Loss", {pred_dims[0], 1}); - ctx->ShareLoD("Predicted", "Loss"); - } }; template @@ -145,17 +111,10 @@ class LogLossGradMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(log_loss, LogLossInferShapeFunctor, + PD_INFER_META(phi::LogLossInferMeta)); REGISTER_OPERATOR(log_loss, ops::LogLossOp, ops::LogLossOpMaker, ops::LogLossGradMaker, - ops::LogLossGradMaker); + ops::LogLossGradMaker, + LogLossInferShapeFunctor); REGISTER_OPERATOR(log_loss_grad, ops::LogLossGradOp); -REGISTER_OP_CPU_KERNEL( - log_loss, ops::LogLossKernel); -REGISTER_OP_CPU_KERNEL( - log_loss_grad, - ops::LogLossGradKernel); -REGISTER_OP_CUDA_KERNEL( - log_loss, ops::LogLossKernel); -REGISTER_OP_CUDA_KERNEL( - log_loss_grad, - ops::LogLossGradKernel); diff --git a/paddle/fluid/operators/log_loss_op.h b/paddle/fluid/operators/log_loss_op.h deleted file mode 100644 index e7985ab810b138da62390fae29eb4a6cf638c897..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/log_loss_op.h +++ /dev/null @@ -1,74 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/eigen/eigen_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -template -using EigenVector = framework::EigenVector; - -template -class LogLossKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* loss_out = ctx.Output("Loss"); - - loss_out->mutable_data(ctx.GetPlace()); - - auto epsilon = static_cast(ctx.Attr("epsilon")); - - auto prediction = EigenVector::Flatten(*ctx.Input("Predicted")); - auto label = EigenVector::Flatten(*ctx.Input("Labels")); - - auto loss = EigenVector::Flatten(*loss_out); - auto& place = *ctx.template device_context().eigen_device(); - - EigenLogLoss, T>::Eval( - place, loss, prediction, label, epsilon); - } -}; - -template -class LogLossGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto epsilon = static_cast(ctx.Attr("epsilon")); - - auto prediction = EigenVector::Flatten(*ctx.Input("Predicted")); - auto label = EigenVector::Flatten(*ctx.Input("Labels")); - - auto* dloss = ctx.Input(framework::GradVarName("Loss")); - auto* dpred = ctx.Output(framework::GradVarName("Predicted")); - - auto dl = EigenVector::Flatten(*dloss); - auto& place = *ctx.template device_context().eigen_device(); - - if (dpred) { - dpred->mutable_data(ctx.GetPlace()); - auto dx = framework::EigenVector::Flatten(*dpred); - EigenLogLossGrad, T>::Eval( - place, dx, dl, prediction, label, epsilon); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/log_loss_op_npu.cc b/paddle/fluid/operators/log_loss_op_npu.cc index 9775910bba5cf30096f395c20d9dff3b5b1e541f..f103a69707a214400bbe2734409df4d9de3902e8 100644 --- a/paddle/fluid/operators/log_loss_op_npu.cc +++ b/paddle/fluid/operators/log_loss_op_npu.cc @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/log_loss_op.h" #include +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/log_loss_op_xpu.cc b/paddle/fluid/operators/log_loss_op_xpu.cc index b2e68e9870d3c4f240fe35a4cbec811aefbc13f1..aa5fdd86745d6932052347f3dc11b14e3d447d20 100644 --- a/paddle/fluid/operators/log_loss_op_xpu.cc +++ b/paddle/fluid/operators/log_loss_op_xpu.cc @@ -10,11 +10,13 @@ See the License for the specific language governing permissions and limitations under the License. */ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/log_loss_op.h" #include +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { +using Tensor = framework::Tensor; + template class LogLossXPUKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/log_softmax_op.cu b/paddle/fluid/operators/log_softmax_op.cu index 034e67568b34cebdfeddb884345b21cd99afb34f..8770abdac838f63b0c9f3a95b1ac0283a80ecbf2 100644 --- a/paddle/fluid/operators/log_softmax_op.cu +++ b/paddle/fluid/operators/log_softmax_op.cu @@ -13,9 +13,9 @@ // limitations under the License. 
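The Eigen-based log_loss kernels deleted above evaluate the epsilon-smoothed log loss and its gradient with respect to the prediction; the scalar sketch below is for reference only and assumes the registered phi kernels compute the same quantities.

#include <cmath>
#include <cstdio>

//   loss  = -y * log(p + eps) - (1 - y) * log(1 - p + eps)
//   dL/dp = dloss * (-y / (p + eps) + (1 - y) / (1 - p + eps))
float LogLoss(float p, float y, float eps) {
  return -y * std::log(p + eps) - (1.f - y) * std::log(1.f - p + eps);
}

float LogLossGrad(float p, float y, float eps, float dloss) {
  return dloss * (-y / (p + eps) + (1.f - y) / (1.f - p + eps));
}

int main() {
  float p = 0.9f, y = 1.f, eps = 1e-4f;
  std::printf("loss=%f dpred=%f\n", LogLoss(p, y, eps),
              LogLossGrad(p, y, eps, 1.f));
  return 0;
}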
#include -#include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/log_softmax_op.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/kernels/funcs/elementwise_functor.h" #include "paddle/phi/kernels/funcs/functors.h" @@ -311,7 +311,7 @@ void LaunchLogSoftmaxForwardCUDAKernelNotLastAxis(T *output_data, template class LogSoftmaxKernel : public framework::OpKernel { - using MPDType = typename details::MPTypeTrait::Type; + using MPDType = typename phi::dtype::MPTypeTrait::Type; public: void Compute(const framework::ExecutionContext &context) const override { @@ -433,7 +433,7 @@ void LaunchSoftmaxBackwardForLastAxis(T *grad_input, const T *grad_output, template class LogSoftmaxGradKernel : public framework::OpKernel { - using MPDType = typename details::MPTypeTrait::Type; + using MPDType = typename phi::dtype::MPTypeTrait::Type; public: void Compute(const framework::ExecutionContext &context) const override { @@ -468,16 +468,18 @@ class LogSoftmaxGradKernel } }; -} // operators -} // paddle +} // namespace operators +} // namespace paddle namespace ops = paddle::operators; namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL( log_softmax, ops::LogSoftmaxKernel, ops::LogSoftmaxKernel, - ops::LogSoftmaxKernel); + ops::LogSoftmaxKernel, + ops::LogSoftmaxKernel); REGISTER_OP_CUDA_KERNEL( log_softmax_grad, ops::LogSoftmaxGradKernel, ops::LogSoftmaxGradKernel, - ops::LogSoftmaxGradKernel); + ops::LogSoftmaxGradKernel, + ops::LogSoftmaxGradKernel); diff --git a/paddle/fluid/operators/lstm_op.h b/paddle/fluid/operators/lstm_op.h index 62f9cd26c418399ac967e62a17d48f0c470b1ae7..4ec3072a96d445805f482060585a888a2a165413 100644 --- a/paddle/fluid/operators/lstm_op.h +++ b/paddle/fluid/operators/lstm_op.h @@ -15,10 +15,10 @@ limitations under the License. */ #pragma once #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/detail/activation_functions.h" -#include "paddle/fluid/operators/math/lstm_compute.h" -#include "paddle/fluid/operators/math/sequence2batch.h" #include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/detail/activation_functions.h" +#include "paddle/phi/kernels/funcs/lstm_compute.h" +#include "paddle/phi/kernels/funcs/sequence2batch.h" namespace paddle { namespace operators { @@ -31,7 +31,7 @@ inline void ReorderInitState(const DeviceContext& ctx, const framework::Tensor& src, framework::Vector index_lod, framework::Tensor* dst, bool indexed_src) { - math::CopyMatrixRowsFunctor row_shuffle; + phi::funcs::CopyMatrixRowsFunctor row_shuffle; dst->mutable_data(src.dims(), ctx.GetPlace()); row_shuffle(ctx, src, index_lod, dst, indexed_src); } @@ -64,7 +64,7 @@ class LSTMKernel : public framework::OpKernel { cell_out->mutable_data(ctx.GetPlace()); bool is_reverse = ctx.Attr("is_reverse"); - math::LoDTensor2BatchFunctor to_batch; + phi::funcs::LoDTensor2BatchFunctor to_batch; auto& device_ctx = ctx.template device_context(); to_batch(device_ctx, *input, batch_gate, true, is_reverse); @@ -80,7 +80,7 @@ class LSTMKernel : public framework::OpKernel { add_bias(device_ctx, *batch_gate, gate_bias, batch_gate); } - math::LstmMetaValue lstm_value; + phi::funcs::LstmMetaValue lstm_value; if (bias && ctx.Attr("use_peepholes")) { T* bias_data = const_cast(bias->data()); // the code style in LstmMetaValue will be updated later. 
@@ -121,11 +121,11 @@ class LSTMKernel : public framework::OpKernel { auto batch_starts = batch_gate->lod()[0]; size_t num_batch = batch_starts.size() - 1; - auto gate_act = math::detail::GetActivationType( + auto gate_act = phi::funcs::detail::GetActivationType( ctx.Attr("gate_activation")); - auto cell_act = math::detail::GetActivationType( + auto cell_act = phi::funcs::detail::GetActivationType( ctx.Attr("cell_activation")); - auto cand_act = math::detail::GetActivationType( + auto cand_act = phi::funcs::detail::GetActivationType( ctx.Attr("candidate_activation")); auto blas = phi::funcs::GetBlas(device_ctx); @@ -166,13 +166,13 @@ class LSTMKernel : public framework::OpKernel { lstm_value.state_value = cell_t.data(); lstm_value.state_active_value = cell_pre_act_t.data(); T cell_clip = 0.0; - math::LstmUnitFunctor::compute( + phi::funcs::LstmUnitFunctor::compute( device_ctx, lstm_value, frame_size, cur_batch_size, cell_clip, gate_act, cell_act, cand_act); lstm_value.prev_state_value = lstm_value.state_value; } - math::Batch2LoDTensorFunctor to_seq; + phi::funcs::Batch2LoDTensorFunctor to_seq; batch_hidden.set_lod(batch_gate->lod()); // restore the output hidden in LoDTensor from the batch hidden to_seq(device_ctx, batch_hidden, hidden_out); @@ -241,7 +241,7 @@ class LSTMGradKernel : public framework::OpKernel { ") should be %d, but received %d in LSTM@Grad operator.", frame_size, out_dims[1])); - math::LstmMetaValue lstm_value; + phi::funcs::LstmMetaValue lstm_value; if (bias && ctx.Attr("use_peepholes")) { T* bias_data = const_cast(bias->data()); lstm_value.check_ig = bias_data + 4 * frame_size; @@ -253,7 +253,7 @@ class LSTMGradKernel : public framework::OpKernel { lstm_value.check_og = nullptr; } - math::LstmMetaGrad lstm_grad; + phi::funcs::LstmMetaGrad lstm_grad; if (bias && bias_g) { bias_g->mutable_data(ctx.GetPlace()); @@ -270,7 +270,7 @@ class LSTMGradKernel : public framework::OpKernel { lstm_grad.check_og_grad = nullptr; } - math::LoDTensor2BatchFunctor to_batch; + phi::funcs::LoDTensor2BatchFunctor to_batch; auto ToBatch = [&batch_gate, &to_batch]( const DeviceContext& ctx, const framework::LoDTensor& src, @@ -293,11 +293,11 @@ class LSTMGradKernel : public framework::OpKernel { batch_gate_g.mutable_data(batch_gate->dims(), ctx.GetPlace()); batch_gate_g.set_lod(batch_gate->lod()); - auto gate_act = math::detail::GetActivationType( + auto gate_act = phi::funcs::detail::GetActivationType( ctx.Attr("gate_activation")); - auto cell_act = math::detail::GetActivationType( + auto cell_act = phi::funcs::detail::GetActivationType( ctx.Attr("cell_activation")); - auto cand_act = math::detail::GetActivationType( + auto cand_act = phi::funcs::detail::GetActivationType( ctx.Attr("candidate_activation")); auto batch_starts = batch_gate->lod()[0]; @@ -338,7 +338,7 @@ class LSTMGradKernel : public framework::OpKernel { lstm_grad.state_active_grad = nullptr; int cur_batch_size = bend - bstart; T cell_clip = 0.0; - math::LstmUnitGradFunctor::compute( + phi::funcs::LstmUnitGradFunctor::compute( device_ctx, lstm_value, lstm_grad, frame_size, cur_batch_size, cell_clip, gate_act, cell_act, cand_act); @@ -369,7 +369,7 @@ class LSTMGradKernel : public framework::OpKernel { } } - math::Batch2LoDTensorFunctor to_seq; + phi::funcs::Batch2LoDTensorFunctor to_seq; if (in_g) { /* backward data */ in_g->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/lstmp_op.h b/paddle/fluid/operators/lstmp_op.h index 96c074f1efb418a872b65b08affc7bdb0ed6a02f..5d24c0b70d3477224e89ca47924816e14abc5c56 
100644 --- a/paddle/fluid/operators/lstmp_op.h +++ b/paddle/fluid/operators/lstmp_op.h @@ -18,12 +18,12 @@ limitations under the License. */ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/activation_op.h" -#include "paddle/fluid/operators/math/detail/activation_functions.h" -#include "paddle/fluid/operators/math/lstm_compute.h" -#include "paddle/fluid/operators/math/sequence2batch.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/transform.h" #include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/detail/activation_functions.h" +#include "paddle/phi/kernels/funcs/lstm_compute.h" +#include "paddle/phi/kernels/funcs/sequence2batch.h" namespace paddle { namespace operators { @@ -72,7 +72,7 @@ inline void ReorderInitState(const DeviceContext& ctx, const framework::Tensor& src, framework::Vector index, framework::Tensor* dst, bool indexed_src) { - math::CopyMatrixRowsFunctor row_shuffle; + phi::funcs::CopyMatrixRowsFunctor row_shuffle; dst->mutable_data(src.dims(), ctx.GetPlace()); row_shuffle(ctx, src, index, dst, indexed_src); } @@ -81,15 +81,15 @@ template class LSTMPKernel : public framework::OpKernel { public: template - void ActCompute(const math::detail::ActivationType act_type, const Device& d, - X x, Y y, platform::Place place) const { - if (act_type == math::detail::ActivationType::kIdentity) { + void ActCompute(const phi::funcs::detail::ActivationType act_type, + const Device& d, X x, Y y, platform::Place place) const { + if (act_type == phi::funcs::detail::ActivationType::kIdentity) { y.device(d) = x; - } else if (act_type == math::detail::ActivationType::kSigmoid) { + } else if (act_type == phi::funcs::detail::ActivationType::kSigmoid) { SigmoidFunctor()(d, x, y); - } else if (act_type == math::detail::ActivationType::kTanh) { + } else if (act_type == phi::funcs::detail::ActivationType::kTanh) { TanhFunctor()(d, x, y); - } else if (act_type == math::detail::ActivationType::kReLU) { + } else if (act_type == phi::funcs::detail::ActivationType::kReLU) { if (place == platform::CPUPlace()) ReluCPUFunctor()(d, x, y); else @@ -120,7 +120,7 @@ class LSTMPKernel : public framework::OpKernel { cell_out->mutable_data(ctx.GetPlace()); bool is_reverse = ctx.Attr("is_reverse"); - math::LoDTensor2BatchFunctor to_batch; + phi::funcs::LoDTensor2BatchFunctor to_batch; auto& device_ctx = ctx.template device_context(); to_batch(device_ctx, *input, batch_gate, true, is_reverse); @@ -137,7 +137,7 @@ class LSTMPKernel : public framework::OpKernel { add_bias(device_ctx, *batch_gate, gate_bias, batch_gate); } - math::LstmMetaValue lstmp_value; + phi::funcs::LstmMetaValue lstmp_value; if (bias && ctx.Attr("use_peepholes")) { T* bias_data = const_cast(bias->data()); // the code style in LstmpMetaValue will be updated later. 
@@ -176,13 +176,13 @@ class LSTMPKernel : public framework::OpKernel { auto batch_starts = batch_gate->lod()[0]; size_t num_batch = batch_starts.size() - 1; - auto gate_act = math::detail::GetActivationType( + auto gate_act = phi::funcs::detail::GetActivationType( ctx.Attr("gate_activation")); - auto cell_act = math::detail::GetActivationType( + auto cell_act = phi::funcs::detail::GetActivationType( ctx.Attr("cell_activation")); - auto cand_act = math::detail::GetActivationType( + auto cand_act = phi::funcs::detail::GetActivationType( ctx.Attr("candidate_activation")); - auto proj_act = math::detail::GetActivationType( + auto proj_act = phi::funcs::detail::GetActivationType( ctx.Attr("proj_activation")); auto& place = *ctx.template device_context().eigen_device(); auto blas = phi::funcs::GetBlas(device_ctx); @@ -222,13 +222,13 @@ class LSTMPKernel : public framework::OpKernel { lstmp_value.output_value = hidden_t.data(); lstmp_value.state_value = cell_t.data(); lstmp_value.state_active_value = cell_pre_act_t.data(); - math::LstmUnitFunctor::compute( + phi::funcs::LstmUnitFunctor::compute( device_ctx, lstmp_value, frame_size, cur_batch_size, cell_clip, gate_act, cell_act, cand_act); lstmp_value.prev_state_value = lstmp_value.state_value; blas.MatMul(hidden_t, false, *proj_weight, false, static_cast(1.0), &proj_t, static_cast(0.0)); - if (proj_act != math::detail::ActivationType::kIdentity) { + if (proj_act != phi::funcs::detail::ActivationType::kIdentity) { auto proj_t_dev = EigenMatrix::From(proj_t); ActCompute(cell_act, place, proj_t_dev, proj_t_dev, ctx.GetPlace()); } @@ -242,7 +242,7 @@ class LSTMPKernel : public framework::OpKernel { } } - math::Batch2LoDTensorFunctor to_seq; + phi::funcs::Batch2LoDTensorFunctor to_seq; batch_proj.set_lod(batch_gate->lod()); // restore the output hidden in LoDTensor from the batch hidden to_seq(device_ctx, batch_proj, proj_out); @@ -257,16 +257,16 @@ template class LSTMPGradKernel : public framework::OpKernel { public: template - void ActGradCompute(const math::detail::ActivationType act_type, + void ActGradCompute(const phi::funcs::detail::ActivationType act_type, const Device& d, X x, Y y, DX dx, DY dy) const { // x is dummy and won't be used even in Relu(use y instead) - if (act_type == math::detail::ActivationType::kIdentity) + if (act_type == phi::funcs::detail::ActivationType::kIdentity) dx.device(d) = dy; - else if (act_type == math::detail::ActivationType::kSigmoid) + else if (act_type == phi::funcs::detail::ActivationType::kSigmoid) SigmoidGradFunctor()(d, x, y, dy, dx); - else if (act_type == math::detail::ActivationType::kTanh) + else if (act_type == phi::funcs::detail::ActivationType::kTanh) TanhGradFunctor()(d, x, y, dy, dx); - else if (act_type == math::detail::ActivationType::kReLU) + else if (act_type == phi::funcs::detail::ActivationType::kReLU) ReluGradFunctor()(d, x, y, dy, dx); else PADDLE_THROW( @@ -340,7 +340,7 @@ class LSTMPGradKernel : public framework::OpKernel { "but received %d in LSTMP@Grad operator.", frame_size, out_dims[1])); - math::LstmMetaValue lstmp_value; + phi::funcs::LstmMetaValue lstmp_value; if (bias && ctx.Attr("use_peepholes")) { T* bias_data = const_cast(bias->data()); lstmp_value.check_ig = bias_data + 4 * frame_size; @@ -352,7 +352,7 @@ class LSTMPGradKernel : public framework::OpKernel { lstmp_value.check_og = nullptr; } - math::LstmMetaGrad lstmp_grad; + phi::funcs::LstmMetaGrad lstmp_grad; if (bias && bias_g) { bias_g->mutable_data(ctx.GetPlace()); @@ -369,7 +369,7 @@ class LSTMPGradKernel : public 
framework::OpKernel { lstmp_grad.check_og_grad = nullptr; } - math::LoDTensor2BatchFunctor to_batch; + phi::funcs::LoDTensor2BatchFunctor to_batch; auto ToBatch = [&batch_gate, &to_batch]( const DeviceContext& ctx, const framework::LoDTensor& src, @@ -393,13 +393,13 @@ class LSTMPGradKernel : public framework::OpKernel { batch_gate_g.mutable_data(batch_gate->dims(), ctx.GetPlace()); batch_gate_g.set_lod(batch_gate->lod()); - auto gate_act = math::detail::GetActivationType( + auto gate_act = phi::funcs::detail::GetActivationType( ctx.Attr("gate_activation")); - auto cell_act = math::detail::GetActivationType( + auto cell_act = phi::funcs::detail::GetActivationType( ctx.Attr("cell_activation")); - auto cand_act = math::detail::GetActivationType( + auto cand_act = phi::funcs::detail::GetActivationType( ctx.Attr("candidate_activation")); - auto proj_act = math::detail::GetActivationType( + auto proj_act = phi::funcs::detail::GetActivationType( ctx.Attr("proj_activation")); auto& place = *ctx.template device_context().eigen_device(); @@ -423,7 +423,7 @@ class LSTMPGradKernel : public framework::OpKernel { _ClipGradFunctor(-1.0 * proj_clip, proj_clip)); } - if (proj_act != math::detail::ActivationType::kIdentity) { + if (proj_act != phi::funcs::detail::ActivationType::kIdentity) { auto cur_proj_dev = EigenMatrix::From(cur_proj); auto proj_g_dev = EigenMatrix::From(proj_g); ActGradCompute(cell_act, place, cur_proj_dev, cur_proj_dev, proj_g_dev, @@ -470,7 +470,7 @@ class LSTMPGradKernel : public framework::OpKernel { lstmp_value.output_value = nullptr; lstmp_grad.state_active_grad = nullptr; - math::LstmUnitGradFunctor::compute( + phi::funcs::LstmUnitGradFunctor::compute( device_ctx, lstmp_value, lstmp_grad, frame_size, cur_batch_size, cell_clip, gate_act, cell_act, cand_act); @@ -503,7 +503,7 @@ class LSTMPGradKernel : public framework::OpKernel { } } - math::Batch2LoDTensorFunctor to_seq; + phi::funcs::Batch2LoDTensorFunctor to_seq; if (in_g) { /* backward data */ in_g->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/lstsq_op.h b/paddle/fluid/operators/lstsq_op.h index a4c3d1c81fb3e32aed506381ea1e6fdbdc5066ba..3cbbc62e7bec92f329535e788f19d439c9341a0e 100644 --- a/paddle/fluid/operators/lstsq_op.h +++ b/paddle/fluid/operators/lstsq_op.h @@ -46,7 +46,7 @@ template class LstsqCPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - using ValueType = phi::funcs::Real; + using ValueType = phi::dtype::Real; const Tensor& x = *context.Input("X"); auto y = context.Input("Y"); @@ -169,7 +169,7 @@ class LstsqCPUKernel : public framework::OpKernel { &rank_32, &wkopt, lwork, &rwkopt, &info); } - lwork = std::max(1, static_cast(phi::funcs::Real(wkopt))); + lwork = std::max(1, static_cast(phi::dtype::Real(wkopt))); Tensor work; work.Resize(phi::make_ddim({lwork})); T* work_data = work.mutable_data(context.GetPlace()); diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index ac6566a87030d4c9cf613134cfe85c379fea5e20..31a98d9f630e1c01f3b886cbe91dd3882b384d05 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -1,5 +1,3 @@ -add_subdirectory(detail) - if (WITH_ASCEND_CL) cc_library(beam_search_npu SRCS beam_search_npu.cc DEPS npu_op_runner) endif() @@ -7,6 +5,8 @@ endif() # please add new math_library in alphabetical order if (WITH_ASCEND_CL) math_library(concat_and_split DEPS concat_and_split_functor npu_op_runner) +elseif 
(WITH_MLU) +math_library(concat_and_split DEPS concat_and_split_functor mlu_baseop) else() math_library(concat_and_split DEPS concat_and_split_functor) endif() @@ -18,8 +18,7 @@ math_library(im2col) math_library(sample_prob) math_library(sampler DEPS generator) -math_library(gru_compute DEPS activation_functions math_function) -math_library(lstm_compute DEPS activation_functions) +# math_library(math_function DEPS blas dense_tensor tensor) math_library(maxouting) math_library(pooling) @@ -29,7 +28,6 @@ else() math_library(selected_rows_functor DEPS selected_rows_utils math_function blas) endif() -math_library(sequence2batch) math_library(sequence_padding) math_library(sequence_pooling DEPS math_function jit_kernel_helper) math_library(sequence_scale) @@ -48,8 +46,6 @@ math_library(vol2col) math_library(prelu) math_library(bert_encoder_functor) math_library(tree2col DEPS math_function) -math_library(matrix_inverse) -math_library(segment_pooling) math_library(matrix_solve) cc_test(selected_rows_functor_test SRCS selected_rows_functor_test.cc DEPS selected_rows_functor) @@ -74,7 +70,6 @@ if(WITH_GPU AND (NOT WITH_ROCM)) endif() endif() -cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info) if(WITH_TESTING AND TEST im2col_test) set_tests_properties(im2col_test PROPERTIES TIMEOUT 120) endif() diff --git a/paddle/fluid/operators/math/concat_and_split.cc b/paddle/fluid/operators/math/concat_and_split.cc index 46126ac59c892787d2f63956983404843e518ae7..c9308d27c0a3490d9c0094f45a1a9c2d894bbf57 100644 --- a/paddle/fluid/operators/math/concat_and_split.cc +++ b/paddle/fluid/operators/math/concat_and_split.cc @@ -18,6 +18,9 @@ limitations under the License. */ #ifdef PADDLE_WITH_ASCEND_CL #include "paddle/fluid/platform/device/npu/npu_op_runner.h" #endif +#ifdef PADDLE_WITH_MLU +#include "paddle/fluid/operators/mlu/mlu_baseop.h" +#endif #include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/float16.h" @@ -226,6 +229,90 @@ class SplitFunctor { }; #endif +#ifdef PADDLE_WITH_MLU +template +class ConcatFunctor { + public: + void operator()(const platform::MLUDeviceContext& context, + const std::vector& input, int axis, + framework::Tensor* output) { + int dev_id = context.GetPlace().GetDeviceId(); + platform::MLUDeviceGuard guard(dev_id); + + auto ins_size = input.size(); + + const int axis_t = axis; + const int ins_size_t = ins_size; + auto place = context.GetPlace(); + output->mutable_data(place); + + // build a CNNL tensor descriptor and collect the raw data pointer + // for each input tensor + std::vector inputs; + std::vector input_descs; + std::vector desc_vector; + for (size_t i = 0; i < ins_size; i++) { + input_descs.emplace_back(MLUCnnlTensorDesc( + input[i], CNNL_LAYOUT_ARRAY, ToCnnlDataType(input[i].dtype()))); + desc_vector.push_back(input_descs.back().get()); + inputs.push_back(input[i].data()); + } + // describe the output tensor + MLUCnnlTensorDesc output_desc(*output, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(output->dtype())); + + // launch the CNNL concat kernel on the MLU + MLUCnnl::Concat(context, ins_size_t, axis_t, desc_vector.data(), + inputs.data(), output_desc.get(), GetBasePtr(output)); + } +}; + +template +class SplitFunctor { + public: + void operator()(const platform::MLUDeviceContext& context, + const framework::Tensor& input, + const std::vector& ref_inputs, + const int axis, std::vector* outputs) { + if (input.numel() == 0) { + return; + } + + int dev_id = context.GetPlace().GetDeviceId(); + platform::MLUDeviceGuard guard(dev_id); + + auto in_dims = input.dims(); + auto out_size = outputs->size(); + + std::vector outs_dims(out_size, in_dims);
+ for (size_t i = 0; i < out_size; ++i) { + outs_dims[i][axis] = ref_inputs[i]->dims()[axis]; + } + + // resize each output tensor and build its CNNL descriptor + std::vector vct_tensor; + std::vector output_descs; + std::vector desc_vector; + for (size_t i = 0; i < out_size; i++) { + (*outputs)[i]->Resize(outs_dims[i]); + (*outputs)[i]->mutable_data(context.GetPlace()); + output_descs.emplace_back( + MLUCnnlTensorDesc(*(*outputs)[i], CNNL_LAYOUT_ARRAY, + ToCnnlDataType((*outputs)[i]->dtype()))); + desc_vector.push_back(output_descs.back().get()); + vct_tensor.push_back(GetBasePtr((*outputs)[i])); + } + // describe the input tensor + MLUCnnlTensorDesc input_desc(input, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(input.dtype())); + + // launch the CNNL split kernel on the MLU + MLUCnnl::Split(context, out_size, axis, input_desc.get(), input.data(), + desc_vector.data(), vct_tensor.data()); + } +}; +#endif + + #define DEFINE_FUNCTOR(type) \ template class ConcatFunctor; \ template class SplitFunctor; @@ -248,6 +335,19 @@ DEFINE_XPU_FUNCTOR(float) FOR_ALL_TYPES(DEFINE_NPU_FUNCTOR) #endif +#ifdef PADDLE_WITH_MLU +#define DEFINE_MLU_FUNCTOR(type) \ + template class ConcatFunctor; \ + template class SplitFunctor; +DEFINE_MLU_FUNCTOR(float) +DEFINE_MLU_FUNCTOR(platform::float16) +DEFINE_MLU_FUNCTOR(int64_t) +DEFINE_MLU_FUNCTOR(bool) +DEFINE_MLU_FUNCTOR(int) +DEFINE_MLU_FUNCTOR(int8_t) +DEFINE_MLU_FUNCTOR(int16_t) +DEFINE_MLU_FUNCTOR(uint8_t) +#endif } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/depthwise_conv.h b/paddle/fluid/operators/math/depthwise_conv.h deleted file mode 100644 index e41f0aedf39ef582b4533b1eeb6ccda1e8ed7e49..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/math/depthwise_conv.h +++ /dev/null @@ -1,72 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License.
*/ - -#pragma once -#include -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/phi/core/hostdevice.h" - -namespace paddle { -namespace operators { -namespace math { - -using DataLayout = framework::DataLayout; - -/* - * \brief Compute the depthwise convolution which include - * forward process and backpropagation process - */ -template -class DepthwiseConvFunctor { - public: - void operator()(const DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& filter, - const std::vector& strides, - const std::vector& paddings, - const std::vector& dilations, framework::Tensor* output, - const DataLayout data_layout = DataLayout::kNCHW); -}; - -template -class DepthwiseConvInputGradFunctor { - public: - void operator()(const DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& filter, - const framework::Tensor& output_grad, - const std::vector& strides, - const std::vector& paddings, - const std::vector& dilations, - framework::Tensor* input_grad, - const DataLayout data_layout = DataLayout::kNCHW); -}; - -template -class DepthwiseConvFilterGradFunctor { - public: - void operator()(const DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output_grad, - const std::vector& strides, - const std::vector& paddings, - const std::vector& dilations, - framework::Tensor* filter_grad, - const DataLayout data_layout = DataLayout::kNCHW); -}; - -} // namespace math -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/math/eigen_values_vectors.h b/paddle/fluid/operators/math/eigen_values_vectors.h index 9b6ebf73d9b09390edb16545d982010eb8692db0..1ade2190bb96e092ad546ace121192a87c8082ff 100644 --- a/paddle/fluid/operators/math/eigen_values_vectors.h +++ b/paddle/fluid/operators/math/eigen_values_vectors.h @@ -63,7 +63,7 @@ struct MatrixEighFunctor { void operator()(const framework::ExecutionContext &ctx, const Tensor &input, Tensor *eigen_values, Tensor *eigen_vectors, bool is_lower, bool has_vectors) { - using ValueType = phi::funcs::Real; + using ValueType = phi::dtype::Real; auto *out_value = eigen_values->mutable_data(ctx.GetPlace()); auto dito = @@ -123,7 +123,7 @@ struct MatrixEighFunctor { for (auto i = 0; i < batch_size; i++) { auto *value_data = out_value + i * values_stride; auto *input_data = input_vector + i * vector_stride; - phi::funcs::lapackEigh>( + phi::funcs::lapackEigh>( jobz, uplo, n, input_data, lda, value_data, work_data, lwork, rwork_data, lrwork, iwork_data, liwork, &info); CheckEighResult(i, info); @@ -151,7 +151,7 @@ struct MatrixEighFunctor { void operator()(const framework::ExecutionContext &ctx, const Tensor &input, Tensor *eigen_values, Tensor *eigen_vectors, bool is_lower, bool has_vectors) { - using ValueType = phi::funcs::Real; + using ValueType = phi::dtype::Real; auto *out_value = eigen_values->mutable_data(ctx.GetPlace()); auto &dev_ctx = ctx.template device_context(); @@ -233,7 +233,7 @@ struct MatrixEighFunctor { } } - using ValueType = phi::funcs::Real; + using ValueType = phi::dtype::Real; inline void EvdBuffer(cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, int n, const T *A, int lda, const ValueType *W, int *lwork) const; diff --git a/paddle/fluid/operators/math/gru_compute.h b/paddle/fluid/operators/math/gru_compute.h deleted file mode 100644 index 70cbfecefc8026f7603e095a53440daeffa29851..0000000000000000000000000000000000000000 --- 
a/paddle/fluid/operators/math/gru_compute.h +++ /dev/null @@ -1,80 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/fluid/operators/math/detail/activation_functions.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace operators { -namespace math { - -template -struct GRUMetaValue { - const T *gate_weight; - const T *state_weight; - const T *reset_bias; - T *gate_value; - T *reset_output_value; - T *output_value; - const T *prev_out_value; -}; - -template -struct GRUMetaGrad { - T *gate_weight_grad; - T *state_weight_grad; - T *gate_grad; - T *reset_output_grad; - T *output_grad; - T *prev_out_grad; - T *bias_hh_grad; -}; - -template -struct GRUUnitFunctor { - static void compute(const DeviceContext &context, GRUMetaValue value, - int frame_size, int batch_size, - const detail::ActivationType active_node, - const detail::ActivationType active_gate, - bool origin_mode); -}; - -template -struct GRUUnitGradFunctor { - static void compute(const DeviceContext &context, GRUMetaValue value, - GRUMetaGrad grad, int frame_size, int batch_size, - const detail::ActivationType active_node, - const detail::ActivationType active_gate, - bool origin_mode); -}; - -template -struct GRUUnitFunctorV2 { - static void compute(const DeviceContext &context, GRUMetaValue value, - int frame_size, int batch_size, - const detail::ActivationType active_node, - const detail::ActivationType active_gate); -}; - -template -struct GRUUnitGradFunctorV2 { - static void compute(const DeviceContext &context, GRUMetaValue value, - GRUMetaGrad grad, int frame_size, int batch_size, - const detail::ActivationType active_node, - const detail::ActivationType active_gate); -}; - -} // namespace math -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/math/inclusive_scan.h b/paddle/fluid/operators/math/inclusive_scan.h index 38692a646111ec468de3fae6df619b33d9b9c8d5..9994ccc10cb13b2f692b18f16182f6bcdad7efa5 100644 --- a/paddle/fluid/operators/math/inclusive_scan.h +++ b/paddle/fluid/operators/math/inclusive_scan.h @@ -115,7 +115,7 @@ static __global__ void InclusiveScanInnerDimCUDAKernel(const T *x, T *y, size_t num_rows, size_t row_size, T init, BinaryOp op) { - using RealT = phi::funcs::Real; + using RealT = phi::dtype::Real; constexpr auto kSharedBufferSize = framework::IsComplex::value ? 4 * kThreadNumX : 2 * kThreadNumX; __shared__ RealT sbuf[kThreadNumY][kSharedBufferSize]; diff --git a/paddle/fluid/operators/math/lstm_compute.cc b/paddle/fluid/operators/math/lstm_compute.cc deleted file mode 100644 index aa4fe65a5201c2db5684ac9407a869834f0eb757..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/math/lstm_compute.cc +++ /dev/null @@ -1,93 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/math/lstm_compute.h" - -#include "paddle/fluid/operators/math/detail/lstm_cpu_kernel.h" -#include "paddle/fluid/operators/math/detail/lstm_kernel.h" - -namespace paddle { -namespace platform { -class CPUDeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -namespace math { - -template -struct LstmUnitFunctor { - static void compute(const platform::CPUDeviceContext& context, - LstmMetaValue value, int frame_size, int batch_size, - T cell_clip, const detail::ActivationType& gate_act, - const detail::ActivationType& cell_act, - const detail::ActivationType& cand_act, - bool old_api_version = true) { - for (int b = 0; b < batch_size; b++) { - detail::cpu_lstm_forward(context, detail::forward::lstm(), value, - frame_size, cell_clip, cand_act, gate_act, - cell_act, old_api_version); - value.gate_value += frame_size * 4; - value.state_value += frame_size; - value.state_active_value += frame_size; - value.output_value += frame_size; - if (value.prev_state_value) { - value.prev_state_value += frame_size; - } - } - } -}; - -template -struct LstmUnitGradFunctor { - static void compute(const platform::CPUDeviceContext& context, - LstmMetaValue value, LstmMetaGrad grad, - int frame_size, int batch_size, T cell_clip, - const detail::ActivationType& gate_act, - const detail::ActivationType& cell_act, - const detail::ActivationType& cand_act, - bool old_api_version = true) { - for (int b = 0; b < batch_size; b++) { - detail::cpu_lstm_backward(context, detail::backward::lstm(), value, - grad, frame_size, cell_clip, cand_act, gate_act, - cell_act, old_api_version); - - value.gate_value += frame_size * 4; - value.state_value += frame_size; - value.state_active_value += frame_size; - value.output_value += frame_size; - if (value.prev_state_value) { - value.prev_state_value += frame_size; - } - - grad.gate_grad += frame_size * 4; - grad.state_grad += frame_size; - grad.state_active_grad += frame_size; - grad.output_grad += frame_size; - if (grad.prev_state_grad) { - grad.prev_state_grad += frame_size; - } - } - } -}; - -template class LstmUnitFunctor; -template class LstmUnitFunctor; -template class LstmUnitGradFunctor; -template class LstmUnitGradFunctor; - -} // namespace math -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/math/lstm_compute.cu b/paddle/fluid/operators/math/lstm_compute.cu deleted file mode 100644 index 4342cb7b79928eb19901a1efa084a3d1d1fbda43..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/math/lstm_compute.cu +++ /dev/null @@ -1,59 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/math/detail/lstm_gpu_kernel.h" -#include "paddle/fluid/operators/math/detail/lstm_kernel.h" -#include "paddle/fluid/operators/math/lstm_compute.h" - -namespace paddle { -namespace operators { -namespace math { - -template -struct LstmUnitFunctor { - static void compute(const platform::CUDADeviceContext& context, - LstmMetaValue value, int frame_size, int batch_size, - T cell_clip, const detail::ActivationType& gate_act, - const detail::ActivationType& cell_act, - const detail::ActivationType& cand_act, - bool old_api_version = true) { - detail::gpu_lstm_forward(context, detail::forward::lstm(), value, - frame_size, batch_size, cell_clip, cand_act, - gate_act, cell_act); - } -}; - -template -struct LstmUnitGradFunctor { - static void compute(const platform::CUDADeviceContext& context, - LstmMetaValue value, LstmMetaGrad grad, - int frame_size, int batch_size, T cell_clip, - const detail::ActivationType& gate_act, - const detail::ActivationType& cell_act, - const detail::ActivationType& cand_act, - bool old_api_version = true) { - detail::gpu_lstm_backward(context, detail::backward::lstm(), value, grad, - frame_size, batch_size, cell_clip, cand_act, - gate_act, cell_act); - } -}; - -template class LstmUnitFunctor; -template class LstmUnitFunctor; -template class LstmUnitGradFunctor; -template class LstmUnitGradFunctor; - -} // namespace math -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/math/matrix_inverse.cc b/paddle/fluid/operators/math/matrix_inverse.cc deleted file mode 100644 index 1b36e615c68df814015a2c308ed74b755f6bc635..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/math/matrix_inverse.cc +++ /dev/null @@ -1,38 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/math/matrix_inverse.h" -#include "Eigen/Core" -#include "Eigen/LU" -#include "paddle/phi/kernels/funcs/blas/blas.h" - -namespace paddle { -namespace operators { -namespace math { - -template -class MatrixInverseFunctor { - public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& a, framework::Tensor* a_inv) { - compute_inverse_eigen(context, a, a_inv); - } -}; - -template class MatrixInverseFunctor; -template class MatrixInverseFunctor; - -} // namespace math -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/math/matrix_inverse.cu.cc b/paddle/fluid/operators/math/matrix_inverse.cu.cc deleted file mode 100644 index 41335a69417a94a567119bb8f37378af957be541..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/math/matrix_inverse.cu.cc +++ /dev/null @@ -1,124 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/math/matrix_inverse.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" - -namespace paddle { -namespace platform { -class CUDADeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -namespace math { - -template -class MatrixInverseFunctor; - -template -class MatrixInverseFunctor { - public: - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& a, framework::Tensor* a_inv) { -#ifndef PADDLE_WITH_HIP - const auto& mat_dims = a.dims(); - const int rank = mat_dims.size(); - int n = mat_dims[rank - 1]; - int batch_size = rank > 2 ? a.numel() / (n * n) : 1; - - memory::allocation::AllocationPtr tmp_gpu_mat_data; - const T* gpu_mat = a.data(); - if (n >= 32) { - // Copy all elements of input matrix A to a temporary memory space to - // avoid being overriden by getrf. - tmp_gpu_mat_data = memory::Alloc(context, a.numel() * sizeof(T)); - memory::Copy(context.GetPlace(), tmp_gpu_mat_data->ptr(), - context.GetPlace(), a.data(), a.numel() * sizeof(T), - context.stream()); - gpu_mat = reinterpret_cast(tmp_gpu_mat_data->ptr()); - } - - std::vector cpu_ptrs(batch_size * 2); - for (int i = 0; i < batch_size; ++i) { - cpu_ptrs[i] = gpu_mat + i * n * n; - cpu_ptrs[i + batch_size] = a_inv->data() + i * n * n; - } - - // Copy the addresses of A and A_inv from host to device. - memory::allocation::AllocationPtr tmp_gpu_ptrs_data = - memory::Alloc(context, cpu_ptrs.size() * sizeof(T*)); - memory::Copy(context.GetPlace(), tmp_gpu_ptrs_data->ptr(), - platform::CPUPlace(), static_cast(cpu_ptrs.data()), - cpu_ptrs.size() * sizeof(T*), context.stream()); - T** gpu_inv_ptrs = - reinterpret_cast(tmp_gpu_ptrs_data->ptr()) + batch_size; - - // Allocate device memory for info and pivots. - int num_ints = n < 32 ? 
batch_size : batch_size * (n + 1); - memory::allocation::AllocationPtr tmp_gpu_info_data = - memory::Alloc(context, num_ints * sizeof(int)); - int* gpu_info_ptr = reinterpret_cast(tmp_gpu_info_data->ptr()); - - auto blas = phi::funcs::GetBlas(context); - - std::vector info; // only for singular checking - info.resize(batch_size); - // This functions in cuBLAS is intended to be used for matrices of small - // sizes where the launch overhead is a significant factor. - // TODO(Xreki): call function in cusolver for large matrices. - if (n < 32) { - // cublasmatinvBatched is a short cut of cublasgetrfBatched - // plus cublasgetriBatched. - // However it only works if N is less than 32. If not, we need to - // go through cublasgetrfBatched and cublasgetriBatched. - blas.BatchedMatInv(n, - reinterpret_cast(tmp_gpu_ptrs_data->ptr()), - gpu_inv_ptrs, gpu_info_ptr, batch_size); - } else { - // This function performs the LU factorization of each matrix A by the - // equation P * A = L * U. L and U are written back to original matrix A, - // and diagonal elements of L are discarded. - int* gpu_pivot_ptr = - reinterpret_cast(tmp_gpu_info_data->ptr()) + batch_size; - blas.BatchedGETRF(n, reinterpret_cast(tmp_gpu_ptrs_data->ptr()), - gpu_pivot_ptr, gpu_info_ptr, batch_size); - - blas.BatchedGETRI(n, - reinterpret_cast(tmp_gpu_ptrs_data->ptr()), - gpu_pivot_ptr, gpu_inv_ptrs, gpu_info_ptr, batch_size); - } - memory::Copy(platform::CPUPlace(), info.data(), context.GetPlace(), - gpu_info_ptr, sizeof(int) * batch_size, context.stream()); - for (int i = 0; i < batch_size; ++i) { - PADDLE_ENFORCE_EQ(info[i], 0, - platform::errors::PreconditionNotMet( - "For batch [%d]: U(%d, %d) is zero, singular U. " - "Please check the matrix value and change it to a " - "non-singular matrix", - i, info[i], info[i])); - } -#else - compute_inverse_eigen(context, a, a_inv); -#endif - } -}; - -template class MatrixInverseFunctor; -template class MatrixInverseFunctor; - -} // namespace math -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/math/maxouting.cc b/paddle/fluid/operators/math/maxouting.cc index 45556e97d1d7afb81d626c99b078cbc215c0195f..28ec3a871022f4b9ec4dce9d9310fd630f10e473 100644 --- a/paddle/fluid/operators/math/maxouting.cc +++ b/paddle/fluid/operators/math/maxouting.cc @@ -14,106 +14,107 @@ limitations under the License. */ #include "paddle/fluid/operators/math/maxouting.h" +#include "paddle/phi/backends/cpu/cpu_context.h" + namespace paddle { namespace operators { namespace math { // All tensors are in NCHW or NHWC format, and the groups must be greater than 1 -template -class MaxOutFunctor { - public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& input, framework::Tensor* output, - const int groups, const int axis) { - const int batch_size = input.dims()[0]; - const int input_height = (axis == 1 ? input.dims()[2] : input.dims()[1]); - const int input_width = (axis == 1 ? 
input.dims()[3] : input.dims()[2]); - const int output_channels = output->dims()[axis]; - int fea_size = input_height * input_width; - // c_size means the output size of each sample - int c_size = fea_size * output_channels; - const T* input_data = input.data(); - T* output_data = output->mutable_data(context.GetPlace()); - for (int i = 0; i < batch_size; ++i) { - int new_bindex = c_size * i; - for (int c = 0; c < output_channels; ++c) { - int new_cindex = fea_size * c; - for (int f = 0; f < fea_size; ++f) { - T ele = static_cast(-FLT_MAX); - int input_idx, output_idx; - for (int ph = 0; ph < groups; ++ph) { - if (axis == 1) { - input_idx = - (new_bindex + new_cindex) * groups + ph * fea_size + f; - } else { - input_idx = (new_bindex + f * output_channels + c) * groups + ph; - } - T x = input_data[input_idx]; - ele = ele > x ? ele : x; - } +template +void MaxOutFunctor::operator()(const DeviceContext& context, + const framework::Tensor& input, + framework::Tensor* output, + const int groups, + const int axis) { + const int batch_size = input.dims()[0]; + const int input_height = (axis == 1 ? input.dims()[2] : input.dims()[1]); + const int input_width = (axis == 1 ? input.dims()[3] : input.dims()[2]); + const int output_channels = output->dims()[axis]; + int fea_size = input_height * input_width; + // c_size means the output size of each sample + int c_size = fea_size * output_channels; + const T* input_data = input.data(); + T* output_data = output->mutable_data(context.GetPlace()); + for (int i = 0; i < batch_size; ++i) { + int new_bindex = c_size * i; + for (int c = 0; c < output_channels; ++c) { + int new_cindex = fea_size * c; + for (int f = 0; f < fea_size; ++f) { + T ele = static_cast(-FLT_MAX); + int input_idx, output_idx; + for (int ph = 0; ph < groups; ++ph) { if (axis == 1) { - output_idx = new_bindex + new_cindex + f; + input_idx = (new_bindex + new_cindex) * groups + ph * fea_size + f; } else { - output_idx = new_bindex + f * output_channels + c; + input_idx = (new_bindex + f * output_channels + c) * groups + ph; } - output_data[output_idx] = ele; + T x = input_data[input_idx]; + ele = ele > x ? ele : x; } + if (axis == 1) { + output_idx = new_bindex + new_cindex + f; + } else { + output_idx = new_bindex + f * output_channels + c; + } + output_data[output_idx] = ele; } } } -}; +} -template -class MaxOutGradFunctor { - public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& input, framework::Tensor* input_grad, - const framework::Tensor& output, - const framework::Tensor& output_grad, const int groups, - const int axis) { - const int batch_size = input.dims()[0]; - const int input_height = (axis == 1 ? input.dims()[2] : input.dims()[1]); - const int input_width = (axis == 1 ? input.dims()[3] : input.dims()[2]); - const int output_channels = output.dims()[axis]; - int fea_size = input_height * input_width; - const T* input_data = input.data(); - const T* output_data = output.data(); - const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); +template +void MaxOutGradFunctor::operator()( + const DeviceContext& context, const framework::Tensor& input, + framework::Tensor* input_grad, const framework::Tensor& output, + const framework::Tensor& output_grad, const int groups, const int axis) { + const int batch_size = input.dims()[0]; + const int input_height = (axis == 1 ? input.dims()[2] : input.dims()[1]); + const int input_width = (axis == 1 ? 
input.dims()[3] : input.dims()[2]); + const int output_channels = output.dims()[axis]; + int fea_size = input_height * input_width; + const T* input_data = input.data(); + const T* output_data = output.data(); + const T* output_grad_data = output_grad.data(); + T* input_grad_data = input_grad->mutable_data(context.GetPlace()); - for (int i = 0; i < batch_size; ++i) { - int blen = fea_size * output_channels * i; - for (int c = 0; c < output_channels; ++c) { - int clen = fea_size * c; - for (int f = 0; f < fea_size; ++f) { - int input_idx0, output_idx; - bool continue_match = true; - if (axis == 1) { - input_idx0 = (blen + clen) * groups + f; - output_idx = blen + clen + f; - } else { - input_idx0 = (blen + f * output_channels + c) * groups; - output_idx = blen + f * output_channels + c; - } - for (int g = 0; g < groups && continue_match; ++g) { - int idx_offset = (axis == 1 ? fea_size * g : g); - int input_idx = input_idx0 + idx_offset; - if (input_data[input_idx] == output_data[output_idx]) { - input_grad_data[input_idx] += output_grad_data[output_idx]; - continue_match = false; - } + for (int i = 0; i < batch_size; ++i) { + int blen = fea_size * output_channels * i; + for (int c = 0; c < output_channels; ++c) { + int clen = fea_size * c; + for (int f = 0; f < fea_size; ++f) { + int input_idx0, output_idx; + bool continue_match = true; + if (axis == 1) { + input_idx0 = (blen + clen) * groups + f; + output_idx = blen + clen + f; + } else { + input_idx0 = (blen + f * output_channels + c) * groups; + output_idx = blen + f * output_channels + c; + } + for (int g = 0; g < groups && continue_match; ++g) { + int idx_offset = (axis == 1 ? fea_size * g : g); + int input_idx = input_idx0 + idx_offset; + if (input_data[input_idx] == output_data[output_idx]) { + input_grad_data[input_idx] += output_grad_data[output_idx]; + continue_match = false; } } } } } -}; +} template class MaxOutGradFunctor; template class MaxOutGradFunctor; template class MaxOutFunctor; template class MaxOutFunctor; +template class MaxOutGradFunctor; +template class MaxOutGradFunctor; +template class MaxOutFunctor; +template class MaxOutFunctor; + } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/maxouting.cu b/paddle/fluid/operators/math/maxouting.cu index 1856fb4eb48c73f96d4f6428ba890c821a61292c..1d0478db5ef4a80d955d1166ffa21ff39f6bd184 100644 --- a/paddle/fluid/operators/math/maxouting.cu +++ b/paddle/fluid/operators/math/maxouting.cu @@ -14,6 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/math/maxouting.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_context.h" namespace paddle { namespace operators { @@ -95,61 +96,57 @@ __global__ void KernelMaxoutGrad(const int nthreads, const T* input_data, /* * All tensors are in NCHW or NHWC format. */ -template -class MaxOutFunctor { - public: - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, framework::Tensor* output, - const int groups, const int axis) { - const int batch_size = input.dims()[0]; - const int input_channels = input.dims()[axis]; - const int input_height = (axis == 1 ? input.dims()[2] : input.dims()[1]); - const int input_width = (axis == 1 ? 
input.dims()[3] : input.dims()[2]); - const int output_channels = output->dims()[axis]; - - const T* input_data = input.data(); - T* output_data = output->mutable_data(context.GetPlace()); - int nthreads = output->numel(); - int blocks = (nthreads + 1024 - 1) / 1024; - dim3 threads(1024, 1); - dim3 grid(blocks, 1); - - KernelMaxOut<<>>( - nthreads, input_data, input_channels, input_height, input_width, groups, - axis, output_data); - } -}; +template +void MaxOutFunctor::operator()(const DeviceContext& context, + const framework::Tensor& input, + framework::Tensor* output, + const int groups, + const int axis) { + const int batch_size = input.dims()[0]; + const int input_channels = input.dims()[axis]; + const int input_height = (axis == 1 ? input.dims()[2] : input.dims()[1]); + const int input_width = (axis == 1 ? input.dims()[3] : input.dims()[2]); + const int output_channels = output->dims()[axis]; + + const T* input_data = input.data(); + T* output_data = output->mutable_data(context.GetPlace()); + int nthreads = output->numel(); + int blocks = (nthreads + 1024 - 1) / 1024; + dim3 threads(1024, 1); + dim3 grid(blocks, 1); + + KernelMaxOut<<>>( + nthreads, input_data, input_channels, input_height, input_width, groups, + axis, output_data); +} + /* * All tensors are in NCHW or NHWC format. */ -template -class MaxOutGradFunctor { - public: - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, framework::Tensor* input_grad, - const framework::Tensor& output, - const framework::Tensor& output_grad, const int groups, - const int axis) { - const int batch_size = input.dims()[0]; - const int input_channels = input.dims()[axis]; - const int input_height = (axis == 1 ? input.dims()[2] : input.dims()[1]); - const int input_width = (axis == 1 ? input.dims()[3] : input.dims()[2]); - const int output_channels = output.dims()[axis]; - - const T* input_data = input.data(); - const T* output_data = output.data(); - const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); - int nthreads = output.numel(); - int blocks = (nthreads + 1024 - 1) / 1024; - dim3 threads(1024, 1); - dim3 grid(blocks, 1); - - KernelMaxoutGrad<<>>( - nthreads, input_data, output_data, output_grad_data, input_grad_data, - input_channels, input_height, input_width, groups, axis); - } -}; +template +void MaxOutGradFunctor::operator()( + const DeviceContext& context, const framework::Tensor& input, + framework::Tensor* input_grad, const framework::Tensor& output, + const framework::Tensor& output_grad, const int groups, const int axis) { + const int batch_size = input.dims()[0]; + const int input_channels = input.dims()[axis]; + const int input_height = (axis == 1 ? input.dims()[2] : input.dims()[1]); + const int input_width = (axis == 1 ? 
input.dims()[3] : input.dims()[2]); + const int output_channels = output.dims()[axis]; + + const T* input_data = input.data(); + const T* output_data = output.data(); + const T* output_grad_data = output_grad.data(); + T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + int nthreads = output.numel(); + int blocks = (nthreads + 1024 - 1) / 1024; + dim3 threads(1024, 1); + dim3 grid(blocks, 1); + + KernelMaxoutGrad<<>>( + nthreads, input_data, output_data, output_grad_data, input_grad_data, + input_channels, input_height, input_width, groups, axis); +} template class MaxOutGradFunctor; template class MaxOutGradFunctor; @@ -157,6 +154,12 @@ template class MaxOutGradFunctor; template class MaxOutFunctor; template class MaxOutFunctor; +template class MaxOutGradFunctor; +template class MaxOutGradFunctor; + +template class MaxOutFunctor; +template class MaxOutFunctor; + } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/maxouting.h b/paddle/fluid/operators/math/maxouting.h index 0d8372df8a2fec306f6091712c66d55d1e71216e..1f4964f7715426d2eab6168ae009ffbd40e1ff0a 100644 --- a/paddle/fluid/operators/math/maxouting.h +++ b/paddle/fluid/operators/math/maxouting.h @@ -30,7 +30,7 @@ class MaxOutFunctor { const int axis = 1); }; -template +template class MaxOutGradFunctor { public: void operator()(const DeviceContext& context, const framework::Tensor& input, diff --git a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc index fcd5c06a6f310f8a23608a77f2d6b9098e99b33a..5ac39953462b5078aa663a7f39f5eb95c96bae7a 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cc +++ b/paddle/fluid/operators/math/selected_rows_functor.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/math/selected_rows_functor.h" +#include "paddle/fluid/platform/device/device_wrapper.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/operators/mkldnn/axpy_handler.h" @@ -502,32 +503,29 @@ struct MergeAdd { out.mutable_value()->mutable_data( phi::make_ddim({static_cast(merge_rows.size()), input_width}), context.GetPlace()); - int r = - xpu::constant(context.x_context(), out.mutable_value()->data(), - merge_rows.size() * input_width, static_cast(0.f)); - PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, - platform::errors::External("XPU constant op return" - " wrong value[%d %s].", - r, XPUAPIErrorMsg[r])); std::unordered_map rows_to_id; for (size_t i = 0; i < merge_rows.size(); ++i) { rows_to_id[merge_rows[i]] = i; } - auto* out_data = out.mutable_value()->data(); - auto* input_data = input.value().data(); + auto* y_data = out.mutable_value()->data(); + auto* x_data = input.value().data(); + int xm = input_rows.size(); + int ym = merge_rows.size(); int n = input_width; - for (size_t i = 0; i < input_rows.size(); i++) { - size_t out_i = rows_to_id[input_rows[i]]; - auto r = xpu::add(context.x_context(), &input_data[i * input_width], - &out_data[out_i * input_width], - &out_data[out_i * input_width], n); - PADDLE_ENFORCE_EQ( - r, XPU_SUCCESS, - platform::errors::External("XPU API return wrong value[%d %s], ", r, - XPUAPIErrorMsg[r])); - } + + xpu::ctx_guard RAII_GUARD(context.x_context()); + int64_t* x_rows_data = RAII_GUARD.alloc_l3_or_gm(xm); + int64_t* y_rows_data = RAII_GUARD.alloc_l3_or_gm(ym); + memory::Copy(context.GetPlace(), y_rows_data, platform::CPUPlace(), + merge_rows.data(), ym * sizeof(int64_t)); + memory::Copy(context.GetPlace(), x_rows_data, platform::CPUPlace(), + input_rows.data(), xm * sizeof(int64_t)); + int r = + xpu::merge_dup_rows(context.x_context(), x_data, y_data, + x_rows_data, y_rows_data, xm, n, ym); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "merge_dup_rows"); } void operator()(const platform::XPUDeviceContext& context, @@ -582,15 +580,7 @@ struct MergeAdd { {static_cast(merged_row_set.size()), input_width}), context.GetPlace()); - int r = - xpu::constant(context.x_context(), out.mutable_value()->data(), - merge_rows.size() * input_width, static_cast(0.f)); - PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, - platform::errors::External("XPU constant op return" - " wrong value[%d %s].", - r, XPUAPIErrorMsg[r])); - - float* out_data = reinterpret_cast(out.mutable_value()->data()); + float* y_data = reinterpret_cast(out.mutable_value()->data()); std::unordered_map rows_to_id; for (size_t i = 0; i < merge_rows.size(); ++i) { @@ -603,17 +593,22 @@ struct MergeAdd { } auto& input_rows = input->rows(); + auto* x_data = input->value().data(); + int xm = input_rows.size(); + int ym = merge_rows.size(); int n = input_width; - for (size_t i = 0; i < input_rows.size(); i++) { - size_t out_i = rows_to_id[input_rows[i]]; - auto r = xpu::add( - context.x_context(), input->value().data() + i * input_width, - &out_data[out_i * input_width], &out_data[out_i * input_width], n); - PADDLE_ENFORCE_EQ( - r, XPU_SUCCESS, - platform::errors::External("XPU API return wrong value[%d %s], ", r, - XPUAPIErrorMsg[r])); - } + + xpu::ctx_guard RAII_GUARD(context.x_context()); + int64_t* x_rows_data = RAII_GUARD.alloc_l3_or_gm(xm); + int64_t* y_rows_data = RAII_GUARD.alloc_l3_or_gm(ym); + memory::Copy(context.GetPlace(), y_rows_data, platform::CPUPlace(), + merge_rows.data(), ym * sizeof(int64_t)); + memory::Copy(context.GetPlace(), x_rows_data, platform::CPUPlace(), + 
input_rows.data(), xm * sizeof(int64_t)); + int r = + xpu::merge_dup_rows(context.x_context(), x_data, y_data, + x_rows_data, y_rows_data, xm, n, ym); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "merge_dup_rows"); } } }; diff --git a/paddle/fluid/operators/math/selected_rows_functor.cu b/paddle/fluid/operators/math/selected_rows_functor.cu index 8563d8b05b186c025ecc4c970a400765adeb0c5d..a4678550cf7bd0d4aa2759d4887dddabed5f9ba4 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cu +++ b/paddle/fluid/operators/math/selected_rows_functor.cu @@ -16,6 +16,7 @@ limitations under the License. */ #include #include "paddle/fluid/operators/math/selected_rows_functor.h" +#include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/float16.h" #include "paddle/phi/kernels/funcs/math_function.h" @@ -445,6 +446,7 @@ template struct MergeAdd; template struct MergeAdd; template struct MergeAdd; template struct MergeAdd; +template struct MergeAdd; template struct MergeAdd>; template struct MergeAdd>; diff --git a/paddle/fluid/operators/math/softmax.cu b/paddle/fluid/operators/math/softmax.cu index fd879e9e6ffe72a2175acc2db98727f5ff39fbbb..83b124902ebb74e65af0a25e432ff6b488e5cee1 100644 --- a/paddle/fluid/operators/math/softmax.cu +++ b/paddle/fluid/operators/math/softmax.cu @@ -120,6 +120,10 @@ template class SoftmaxCUDNNFunctor; template class SoftmaxCUDNNFunctor; template class SoftmaxGradCUDNNFunctor; template class SoftmaxGradCUDNNFunctor; +#if CUDNN_VERSION_MIN(8, 1, 0) +template class SoftmaxCUDNNFunctor; +template class SoftmaxGradCUDNNFunctor; +#endif // MIOPEN do not support double #ifndef PADDLE_WITH_HIP @@ -131,6 +135,10 @@ template class SoftmaxFunctor; template class SoftmaxFunctor; +template class SoftmaxFunctor; +template class SoftmaxFunctor; template class SoftmaxFunctor; template class SoftmaxFunctor; template class SoftmaxFunctor; @@ -139,9 +147,13 @@ template class SoftmaxGradFunctor; template class SoftmaxGradFunctor; template class SoftmaxGradFunctor; +template class SoftmaxGradFunctor; template class SoftmaxFunctor; template class SoftmaxFunctor; +template class SoftmaxFunctor; +template class SoftmaxFunctor; template class SoftmaxFunctor; template class SoftmaxFunctor; template class SoftmaxFunctor; @@ -149,6 +161,7 @@ template class SoftmaxFunctor; template class SoftmaxGradFunctor; template class SoftmaxGradFunctor; template class SoftmaxGradFunctor; +template class SoftmaxGradFunctor; } // namespace math } // namespace operators diff --git a/paddle/fluid/operators/math/softmax_impl.h b/paddle/fluid/operators/math/softmax_impl.h index d51d638e0c19f43f9b0a91adbac15dffcdf14588..9833b4447ec45376e04ad520315e88568f7991d8 100644 --- a/paddle/fluid/operators/math/softmax_impl.h +++ b/paddle/fluid/operators/math/softmax_impl.h @@ -156,6 +156,65 @@ class SoftmaxEigen { } }; +template +class SoftmaxEigen { + public: + void operator()(const DeviceContext& context, const int axis_dim, + const framework::Tensor* X, framework::Tensor* Y) { + constexpr int kBatchDim = 0; + constexpr int kClassDim = 1; + constexpr int kAxisDim = 1; + + auto logits = EigenMatrix::From(*X); + auto softmax = EigenMatrix::From(*Y); + + const int batch_size = logits.dimension(kBatchDim); + const int num_classes = logits.dimension(kClassDim); + const int num_remain = num_classes / axis_dim; + + Eigen::DSizes along_axis(kAxisDim); + Eigen::DSizes batch_classes(batch_size, num_classes); + Eigen::DSizes batch_by_one(batch_size, 1); + Eigen::DSizes 
one_by_class(1, num_classes); + Eigen::DSizes batch_one_remain(batch_size, 1, num_remain); + Eigen::DSizes one_axis_one(1, axis_dim, 1); + Eigen::DSizes one_axis(1, axis_dim); + Eigen::DSizes batch_axis_remain(batch_size, axis_dim, num_remain); + + // For numerical stability, logits should be shifted by maximum number along + // axis, calculate shifted_logits into softmax tensor for memory reuse. + if (num_remain == 1) { + // axis == -1, axis and class in same dimension, calculate along + // class dimension directly for higher performance + softmax.device(*context.eigen_device()) = + (logits - + logits.maximum(along_axis) + .reshape(batch_by_one) + .broadcast(one_by_class)) + .unaryExpr(ValueClip()); + } else { + // axis != -1, class dimension split into (axis, remain), max and sum + // should be calculated along axis dimension + softmax.device(*context.eigen_device()) = + (logits.reshape(batch_axis_remain) - + logits.reshape(batch_axis_remain) + .maximum(along_axis) + .reshape(batch_one_remain) + .broadcast(one_axis_one) + .reshape(batch_classes)) + .unaryExpr(ValueClip()); + } + + softmax.device(*context.eigen_device()) = softmax.exp(); + softmax.device(*context.eigen_device()) = + (softmax * + softmax.reshape(batch_axis_remain) + .sum(along_axis) + .inverse() + .broadcast(one_axis)); + } +}; + template void SoftmaxFunctor::operator()( const DeviceContext& context, const int axis_dim, @@ -289,6 +348,38 @@ class SoftmaxGradEigen { } }; +template +class SoftmaxGradEigen { + public: + void operator()(const DeviceContext& context, const int axis_dim, + const framework::Tensor* y, const framework::Tensor* y_grad, + framework::Tensor* x_grad) { + auto softmax = EigenMatrix::From(*y); + auto softmax_grad = EigenMatrix::From(*y_grad); + auto logits_grad = EigenMatrix::From(*x_grad); + + constexpr int kBatchDim = 0; + constexpr int kClassDim = 1; + + const int batch_size = softmax.dimension(kBatchDim); + const int num_classes = softmax.dimension(kClassDim); + const int num_remain = num_classes / axis_dim; + + Eigen::DSizes along_class(kClassDim); + Eigen::DSizes batch_by_one(batch_size, 1); + Eigen::DSizes one_by_class(1, num_classes); + Eigen::DSizes batch_axis_remain(batch_size, axis_dim, num_remain); + Eigen::DSizes one_axis(1, axis_dim); + + auto dot = (softmax * softmax_grad) + .reshape(batch_axis_remain) + .sum(along_class) + .broadcast(one_axis); + logits_grad.device(*context.eigen_device()) = + (softmax_grad - dot) * softmax; + } +}; + template void SoftmaxGradFunctor::operator()( const DeviceContext& context, const int axis_dim, diff --git a/paddle/fluid/operators/math/vol2col.cc b/paddle/fluid/operators/math/vol2col.cc index 42bf1f471deb5238fdb34dcd9284972930305f58..bc5a589ed6fb137c5013253a65971dcf80d4ac72 100644 --- a/paddle/fluid/operators/math/vol2col.cc +++ b/paddle/fluid/operators/math/vol2col.cc @@ -14,6 +14,8 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/math/vol2col.h" +#include "paddle/phi/backends/cpu/cpu_context.h" + namespace paddle { namespace platform { class CPUDeviceContext; @@ -141,6 +143,116 @@ class Vol2ColFunctor { } }; +template +class Vol2ColFunctor { + public: + void operator()(const phi::CPUContext& context, const framework::Tensor& vol, + const std::vector& dilations, + const std::vector& strides, + const std::vector& paddings, framework::Tensor* col, + const DataLayout data_layout) const { + PADDLE_ENFORCE_EQ(vol.dims().size(), 4, + platform::errors::InvalidArgument( + "The dimension of vol should be 4, but received %d.", + vol.dims().size())); + + PADDLE_ENFORCE_EQ(col->dims().size(), 7, + platform::errors::InvalidArgument( + "The dimension of col should be 7, but received %d.", + col->dims().size())); + + int input_channels = + (data_layout != DataLayout::kNHWC ? vol.dims()[0] : vol.dims()[3]); + int input_depth = + (data_layout != DataLayout::kNHWC ? vol.dims()[1] : vol.dims()[0]); + int input_height = + (data_layout != DataLayout::kNHWC ? vol.dims()[2] : vol.dims()[1]); + int input_width = + (data_layout != DataLayout::kNHWC ? vol.dims()[3] : vol.dims()[2]); + int filter_depth = col->dims()[1]; + int filter_height = col->dims()[2]; + int filter_width = col->dims()[3]; + int output_depth = col->dims()[4]; + int output_height = col->dims()[5]; + int output_width = col->dims()[6]; + int channels_col = + input_channels * filter_depth * filter_height * filter_width; + + // changed + bool paddings_size_is_6 = (paddings.size() == 6); + int pad_d_forth = paddings_size_is_6 ? paddings[0] : paddings[0]; + int pad_d_back = paddings_size_is_6 ? paddings[1] : paddings[0]; + int pad_h_up = paddings_size_is_6 ? paddings[2] : paddings[1]; + int pad_h_down = paddings_size_is_6 ? paddings[3] : paddings[1]; + int pad_w_left = paddings_size_is_6 ? paddings[4] : paddings[2]; + int pad_w_right = paddings_size_is_6 ? 
paddings[5] : paddings[2]; + + auto input_depth_tmp = (input_depth + pad_d_forth + pad_d_back - + ((dilations[0] * (filter_depth - 1) + 1))) / + strides[0] + + 1; + PADDLE_ENFORCE_EQ( + input_depth_tmp, output_depth, + platform::errors::InvalidArgument( + "input_depth(%d) and output_depth(%d) are mismatching.", + input_depth_tmp, output_depth)); + auto input_height_tmp = (input_height + pad_h_up + pad_h_down - + ((dilations[1] * (filter_height - 1) + 1))) / + strides[1] + + 1; + PADDLE_ENFORCE_EQ( + input_height_tmp, output_height, + platform::errors::InvalidArgument( + "input_height(%d) and output_height(%d) are mismatching.", + input_height_tmp, output_height)); + auto input_width_tmp = (input_width + pad_w_left + pad_w_right - + ((dilations[2] * (filter_width - 1) + 1))) / + strides[2] + + 1; + PADDLE_ENFORCE_EQ( + input_width_tmp, output_width, + platform::errors::InvalidArgument( + "input_width(%d) and output_width(%d) are mismatching.", + input_width_tmp, output_width)); + const T* vol_data = vol.data(); + T* col_data = col->data(); + + for (int c = 0; c < channels_col; ++c) { + int w_offset = c % filter_width; + int h_offset = (c / filter_width) % filter_height; + int d_offset = (c / filter_width / filter_height) % filter_depth; + int c_in = c / filter_width / filter_height / filter_depth; + for (int d = 0; d < output_depth; ++d) { + int d_pad = d * strides[0] - pad_d_forth + d_offset * dilations[0]; + for (int h = 0; h < output_height; ++h) { + int h_pad = h * strides[1] - pad_h_up + h_offset * dilations[1]; + for (int w = 0; w < output_width; ++w) { + int w_pad = w * strides[2] - pad_w_left + w_offset * dilations[2]; + + int col_idx = + ((c * output_depth + d) * output_height + h) * output_width + w; + int vol_idx; + if (data_layout != DataLayout::kNHWC) { + vol_idx = ((c_in * input_depth + d_pad) * input_height + h_pad) * + input_width + + w_pad; + } else { + vol_idx = ((d_pad * input_height + h_pad) * input_width + w_pad) * + input_channels + + c_in; + } + col_data[col_idx] = + (h_pad < 0 || h_pad >= input_height || w_pad < 0 || + w_pad >= input_width || d_pad < 0 || d_pad >= input_depth) + ? static_cast(0) + : vol_data[vol_idx]; + } + } + } + } + } +}; + /* * vol = [input_channels,input_depth, input_height, input_width] * col = @@ -258,10 +370,125 @@ class Col2VolFunctor { } }; +template +class Col2VolFunctor { + public: + void operator()(const phi::CPUContext& context, const framework::Tensor& col, + const std::vector& dilations, + const std::vector& strides, + const std::vector& paddings, framework::Tensor* vol, + const DataLayout data_layout) const { + PADDLE_ENFORCE_EQ(vol->dims().size(), 4, + platform::errors::InvalidArgument( + "The dimension of vol should be 4, but received %d.", + vol->dims().size())); + + PADDLE_ENFORCE_EQ(col.dims().size(), 7, + platform::errors::InvalidArgument( + "The dimension of col should be 7, but received %d.", + col.dims().size())); + + int input_channels = + (data_layout != DataLayout::kNHWC ? vol->dims()[0] : vol->dims()[3]); + int input_depth = + (data_layout != DataLayout::kNHWC ? vol->dims()[1] : vol->dims()[0]); + int input_height = + (data_layout != DataLayout::kNHWC ? vol->dims()[2] : vol->dims()[1]); + int input_width = + (data_layout != DataLayout::kNHWC ? 
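// A small standalone helper mirroring the dimension checks above: for each
// spatial axis of vol2col, the expected output extent follows the usual
// convolution output-size formula. ConvOutSize is a hypothetical free
// function, shown only to make the PADDLE_ENFORCE_EQ relations explicit.
#include <cassert>

inline int ConvOutSize(int in, int pad_before, int pad_after, int dilation,
                       int kernel, int stride) {
  // Effective kernel extent under dilation is dilation * (kernel - 1) + 1.
  return (in + pad_before + pad_after - (dilation * (kernel - 1) + 1)) /
             stride +
         1;
}

inline void ConvOutSizeExample() {
  // depth 8, symmetric padding 1, dilation 1, kernel 3, stride 2 -> 4,
  // i.e. the value the checks above compare against col->dims()[4..6].
  assert(ConvOutSize(8, 1, 1, 1, 3, 2) == 4);
}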
vol->dims()[3] : vol->dims()[2]); + int filter_depth = col.dims()[1]; + int filter_height = col.dims()[2]; + int filter_width = col.dims()[3]; + int output_depth = col.dims()[4]; + int output_height = col.dims()[5]; + int output_width = col.dims()[6]; + int channels_col = + input_channels * filter_depth * filter_height * filter_width; + + bool paddings_size_is_6 = (paddings.size() == 6); + int pad_d_forth = paddings_size_is_6 ? paddings[0] : paddings[0]; + int pad_d_back = paddings_size_is_6 ? paddings[1] : paddings[0]; + int pad_h_up = paddings_size_is_6 ? paddings[2] : paddings[1]; + int pad_h_down = paddings_size_is_6 ? paddings[3] : paddings[1]; + int pad_w_left = paddings_size_is_6 ? paddings[4] : paddings[2]; + int pad_w_right = paddings_size_is_6 ? paddings[5] : paddings[2]; + + auto input_depth_tmp = (input_depth + pad_d_forth + pad_d_back - + ((dilations[0] * (filter_depth - 1) + 1))) / + strides[0] + + 1; + PADDLE_ENFORCE_EQ( + input_depth_tmp, output_depth, + platform::errors::InvalidArgument( + "input_depth(%d) and output_depth(%d) are mismatching.", + input_depth_tmp, output_depth)); + auto input_height_tmp = (input_height + pad_h_up + pad_h_down - + ((dilations[1] * (filter_height - 1) + 1))) / + strides[1] + + 1; + PADDLE_ENFORCE_EQ( + input_height_tmp, output_height, + platform::errors::InvalidArgument( + "input_height(%d) and output_height(%d) are mismatching.", + input_height_tmp, output_height)); + auto input_width_tmp = (input_width + pad_w_left + pad_w_right - + ((dilations[2] * (filter_width - 1) + 1))) / + strides[2] + + 1; + PADDLE_ENFORCE_EQ( + input_width_tmp, output_width, + platform::errors::InvalidArgument( + "input_width(%d) and output_width(%d) are mismatching.", + input_width_tmp, output_width)); + T* vol_data = vol->data(); + const T* col_data = col.data(); + + for (int c = 0; c < channels_col; ++c) { + int w_offset = c % filter_width; + int h_offset = (c / filter_width) % filter_height; + int d_offset = (c / filter_width / filter_height) % filter_depth; + int cIm = c / filter_width / filter_height / filter_depth; + for (int d = 0; d < output_depth; ++d) { + int d_pad = d * strides[0] - pad_d_forth + d_offset * dilations[0]; + for (int h = 0; h < output_height; ++h) { + int h_pad = h * strides[1] - pad_h_up + h_offset * dilations[1]; + for (int w = 0; w < output_width; ++w) { + int w_pad = w * strides[2] - pad_w_left + w_offset * dilations[2]; + + if (h_pad >= 0 && h_pad < input_height && w_pad >= 0 && + w_pad < input_width && d_pad >= 0 && d_pad < input_depth) { + int vol_idx; + if (data_layout != DataLayout::kNHWC) { + vol_idx = ((cIm * input_depth + d_pad) * input_height + h_pad) * + input_width + + w_pad; + } else { + vol_idx = + ((d_pad * input_height + h_pad) * input_width + w_pad) * + input_channels + + cIm; + } + int col_idx = + ((c * output_depth + d) * output_height + h) * output_width + + w; + vol_data[vol_idx] += col_data[col_idx]; + } + } + } + } + } + } +}; + template class Vol2ColFunctor; template class Vol2ColFunctor; +template class Vol2ColFunctor; +template class Vol2ColFunctor; + template class Col2VolFunctor; template class Col2VolFunctor; +template class Col2VolFunctor; +template class Col2VolFunctor; } // namespace math } // namespace operators diff --git a/paddle/fluid/operators/matmul_op_mlu.cc b/paddle/fluid/operators/matmul_op_mlu.cc new file mode 100644 index 0000000000000000000000000000000000000000..d0c84c4751e78e6bd02c4a988a7d3558962a0de5 --- /dev/null +++ b/paddle/fluid/operators/matmul_op_mlu.cc @@ -0,0 +1,337 @@ +/* Copyright 
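// A standalone sketch of the index bookkeeping shared by the Vol2ColFunctor
// and Col2VolFunctor specializations above: each column channel c encodes
// (c_in, d_off, h_off, w_off), and the volume offset depends on whether the
// layout is channels-first (NCDHW) or channels-last (NDHWC). Vol2Col copies
// vol -> col, zero-filling padded positions; Col2Vol accumulates col -> vol.
// Simplified free functions, for illustration only.
struct ColChannel {
  int c_in, d_off, h_off, w_off;
};

inline ColChannel DecodeColChannel(int c, int filter_d, int filter_h,
                                   int filter_w) {
  ColChannel r;
  r.w_off = c % filter_w;
  r.h_off = (c / filter_w) % filter_h;
  r.d_off = (c / filter_w / filter_h) % filter_d;
  r.c_in = c / filter_w / filter_h / filter_d;
  return r;
}

inline int VolIndex(bool channels_first, int c_in, int d, int h, int w,
                    int in_c, int in_d, int in_h, int in_w) {
  return channels_first
             ? ((c_in * in_d + d) * in_h + h) * in_w + w
             : ((d * in_h + h) * in_w + w) * in_c + c_in;
}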
(c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +static void Mul(const framework::ExecutionContext& ctx, const Tensor& X, + const Tensor& Y, Tensor* Out, const float alpha) { + Out->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc x_desc(X, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); + MLUCnnlTensorDesc y_desc(Y, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); + MLUCnnlTensorDesc out_desc(*Out, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); + + MLUCnnlOpTensorDesc mul_op_desc(CNNL_OP_TENSOR_MUL, ToCnnlDataType(), + CNNL_NOT_PROPAGATE_NAN); + MLUCnnl::OpTensor(ctx, mul_op_desc.get(), x_desc.get(), GetBasePtr(&X), + y_desc.get(), GetBasePtr(&Y), out_desc.get(), + GetBasePtr(Out), ToCnnlDataType(), alpha); +} + +template +static void MatMul2D(const framework::ExecutionContext& ctx, const Tensor& X, + const Tensor& Y, Tensor* Out, const bool trans_x, + const bool trans_y, const float alpha) { + Out->mutable_data(ctx.GetPlace()); + + PADDLE_ENFORCE_LT(fabs(alpha - 1.0), std::numeric_limits::epsilon(), + platform::errors::InvalidArgument( + "MLU(matmul): alpha should be equal to 1.0! " + "Other values are not supported yet." + "But received alpha is %d.", + alpha)); + + MLUCnnlTensorDesc x_desc(X, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); + MLUCnnlTensorDesc y_desc(Y, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); + MLUCnnlTensorDesc out_desc(*Out, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); + + MLUCnnl::Matmul(ctx, trans_x, trans_y, x_desc.get(), GetBasePtr(&X), + y_desc.get(), GetBasePtr(&Y), out_desc.get(), + GetBasePtr(Out)); +} + +template +static void MatMulND(const framework::ExecutionContext& ctx, const Tensor& X, + const Tensor& Y, Tensor* Out, const bool trans_x, + const bool trans_y, const float alpha) { + if (!Out->initialized()) { + Out->mutable_data(ctx.GetPlace()); + } + + PADDLE_ENFORCE_LT(fabs(alpha - 1.0), std::numeric_limits::epsilon(), + platform::errors::InvalidArgument( + "MLU(matmul): alpha should be equal to 1.0! " + "Other values are not supported yet." 
+ "But received alpha is %d.", + alpha)); + + MLUCnnlTensorDesc x_desc(X, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); + MLUCnnlTensorDesc y_desc(Y, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); + MLUCnnlTensorDesc out_desc(*Out, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); + + MLUCnnl::BatchMatmul(ctx, trans_x, trans_y, x_desc.get(), GetBasePtr(&X), + y_desc.get(), GetBasePtr(&Y), out_desc.get(), + GetBasePtr(Out)); +} + +template +static void ReduceDims(const framework::ExecutionContext& ctx, + const std::vector& dims, + const std::vector& bcast_dims, const Tensor& in, + Tensor* out) { + std::vector axes; + int64_t size = bcast_dims.size(); + int64_t diff = bcast_dims.size() - dims.size(); + for (int64_t i = 0; i < size; ++i) { + if (i < diff) { + axes.push_back(i); + continue; + } + if (bcast_dims[i] > dims[i - diff]) { + axes.push_back(i); + } + } + out->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc in_desc(in, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); + MLUCnnlTensorDesc out_desc(*out, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); + + std::vector reduce_dims(axes.begin(), axes.end()); + MLUCnnlReduceDesc reduce_desc(reduce_dims, CNNL_REDUCE_ADD, + ToCnnlDataType(), CNNL_NOT_PROPAGATE_NAN, + CNNL_REDUCE_NO_INDICES, CNNL_32BIT_INDICES); + + MLUCnnl::Reduce(ctx, true /*need_workspace*/, reduce_desc.get(), nullptr, + in_desc.get(), GetBasePtr(&in), 0 /*indices_size*/, nullptr, + nullptr, out_desc.get(), GetBasePtr(out)); +} + +template +class MatMulMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* X = ctx.Input("X"); + auto* Y = ctx.Input("Y"); + auto* Out = ctx.Output("Out"); + bool transpose_x = ctx.Attr("transpose_X"); + bool transpose_y = ctx.Attr("transpose_Y"); + float alpha = static_cast(ctx.Attr("alpha")); + + std::vector x_dims = phi::vectorize(X->dims()); + std::vector y_dims = phi::vectorize(Y->dims()); + std::vector out_dims = phi::vectorize(Out->dims()); + int x_ndim = x_dims.size(); + int y_ndim = y_dims.size(); + + // Case 1: [K] x [K] = [1] + // Equal: [1, K] x [K, 1] = [1, 1] => [1] + const bool all_one_dim = (x_ndim == 1 && y_ndim == 1); + if (all_one_dim) { + Out->Resize({1, 1}); + } + + // Resize dim 1 to 2 + Tensor x_temp, y_temp; + x_temp.ShareDataWith(*X); + y_temp.ShareDataWith(*Y); + if (x_ndim == 1) { + x_dims.insert(x_dims.begin(), 1); + x_temp.Resize(phi::make_ddim(x_dims)); + x_ndim = 2; + // matmul op of mlu needs `std::max(x->dim, y->dim) == out->dim` + if (out_dims.size() < y_dims.size()) { + std::vector temp_out_dims(out_dims.begin(), out_dims.end()); + temp_out_dims.insert(temp_out_dims.end() - 1, 1); + Out->Resize(phi::make_ddim(temp_out_dims)); + } + } + if (y_ndim == 1) { + y_dims.push_back(1); + y_temp.Resize(phi::make_ddim(y_dims)); + y_ndim = 2; + // matmul op of mlu needs `std::max(x->dim, y->dim) == out->dim` + if (out_dims.size() < x_dims.size()) { + std::vector temp_out_dims(out_dims.begin(), out_dims.end()); + temp_out_dims.push_back(1); + Out->Resize(phi::make_ddim(temp_out_dims)); + } + } + + const int K = transpose_x ? x_dims[x_ndim - 2] : x_dims[x_ndim - 1]; + if (transpose_y) { + PADDLE_ENFORCE_EQ(y_dims[y_ndim - 1], K, + platform::errors::InvalidArgument( + "Input(Y) has error dim." + "Y'dims[%d] must be equal to %d" + "But received Y'dims[%d] is %d", + y_ndim - 1, K, y_ndim - 1, y_dims[y_ndim - 1])); + } else { + PADDLE_ENFORCE_EQ(y_dims[y_ndim - 2], K, + platform::errors::InvalidArgument( + "Input(Y) has error dim." 
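// A standalone sketch of the axis selection inside the ReduceDims helper
// above: when a gradient was computed in broadcast shape, every leading axis
// the original operand lacked, plus every axis that was expanded from 1,
// must be sum-reduced back. Plain std::vector version, illustration only.
#include <cstdint>
#include <vector>

inline std::vector<int64_t> AxesToReduce(const std::vector<int64_t>& dims,
                                         const std::vector<int64_t>& bcast) {
  std::vector<int64_t> axes;
  const int64_t diff = static_cast<int64_t>(bcast.size() - dims.size());
  for (int64_t i = 0; i < static_cast<int64_t>(bcast.size()); ++i) {
    if (i < diff || bcast[i] > dims[i - diff]) axes.push_back(i);
  }
  return axes;
}
// Example: dims = {3, 1, 5}, bcast = {2, 3, 4, 5} -> reduce axes {0, 2},
// i.e. the added batch axis and the axis that was broadcast from 1 to 4.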
+ "Y'dims[%d] must be equal to %d" + "But received Y'dims[%d] is %d", + y_ndim - 2, K, y_ndim - 2, y_dims[y_ndim - 2])); + } + + if (x_ndim == 2 && y_ndim == 2) { + // Case 2: [M, K] x [K, N] = [M, N] + MatMul2D(ctx, x_temp, y_temp, Out, transpose_x, transpose_y, alpha); + } else { + // Case 3: [B, M, K] x [K, N] = [B, M, N] + // Case 4: [B, M, K] x [B, K, N] = [B, M, N] + MatMulND(ctx, x_temp, y_temp, Out, transpose_x, transpose_y, alpha); + } + + if (phi::vectorize(Out->dims()) != out_dims) { + Out->Resize(phi::make_ddim(out_dims)); + } + } +}; + +template +class MatMulGradMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* X = ctx.Input("X"); + auto* Y = ctx.Input("Y"); + auto* dOut = ctx.Input(framework::GradVarName("Out")); + auto* dX = ctx.Output(framework::GradVarName("X")); + auto* dY = ctx.Output(framework::GradVarName("Y")); + bool transpose_x = ctx.Attr("transpose_X"); + bool transpose_y = ctx.Attr("transpose_Y"); + float alpha = static_cast(ctx.Attr("alpha")); + + std::vector x_dims = phi::vectorize(X->dims()); + std::vector y_dims = phi::vectorize(Y->dims()); + std::vector out_dims = phi::vectorize(dOut->dims()); + int x_ndim = x_dims.size(); + int y_ndim = y_dims.size(); + int out_ndim = out_dims.size(); + + // Case 1: [K] x [K] = [1] + if (x_ndim == 1 && y_ndim == 1) { + if (dX) { + Mul(ctx, *dOut, *Y, dX, alpha); + } + if (dY) { + Mul(ctx, *dOut, *X, dY, alpha); + } + return; + } + + // Resize dim 1 to 2 + Tensor x_temp, y_temp, dout_temp; + x_temp.ShareDataWith(*X); + y_temp.ShareDataWith(*Y); + dout_temp.ShareDataWith(*dOut); + if (x_ndim == 1) { + x_dims.insert(x_dims.begin(), 1); + out_dims.insert(out_dims.end() - 1, 1); + x_temp.Resize(phi::make_ddim(x_dims)); + dout_temp.Resize(phi::make_ddim(out_dims)); + x_ndim = 2; + out_ndim += 1; + } + if (y_ndim == 1) { + y_dims.push_back(1); + out_dims.push_back(1); + y_temp.Resize(phi::make_ddim(y_dims)); + dout_temp.Resize(phi::make_ddim(out_dims)); + y_ndim = 2; + out_ndim += 1; + } + + // Case 2: [M, K] x [K, N] = [M, N] + if (out_ndim == 2) { + if (dX) { + dX->Resize(phi::make_ddim(x_dims)); + if (transpose_x) { + MatMul2D(ctx, y_temp, dout_temp, dX, transpose_y, true, alpha); + } else { + MatMul2D(ctx, dout_temp, y_temp, dX, false, !transpose_y, alpha); + } + dX->Resize(X->dims()); + } + if (dY) { + dY->Resize(phi::make_ddim(y_dims)); + if (transpose_y) { + MatMul2D(ctx, dout_temp, x_temp, dY, true, transpose_x, alpha); + } else { + MatMul2D(ctx, x_temp, dout_temp, dY, !transpose_x, false, alpha); + } + dY->Resize(Y->dims()); + } + return; + } + + // Case 3: [B, M, K] x [K, N] = [B, M, N] + // Case 4: [B, M, K] x [B, K, N] = [B, M, N] + std::vector x_bcast_dims(out_ndim, 1); + std::vector y_bcast_dims(out_ndim, 1); + std::copy(out_dims.begin(), out_dims.end() - 2, x_bcast_dims.begin()); + std::copy(out_dims.begin(), out_dims.end() - 2, y_bcast_dims.begin()); + std::copy(x_dims.end() - 2, x_dims.end(), x_bcast_dims.end() - 2); + std::copy(y_dims.end() - 2, y_dims.end(), y_bcast_dims.end() - 2); + + if (dX) { + Tensor dx_temp(X->type()); + if (x_dims != x_bcast_dims) { + dx_temp.Resize(phi::make_ddim(x_bcast_dims)); + } else { + dX->mutable_data(ctx.GetPlace()); + dx_temp.ShareDataWith(*dX); + } + + if (transpose_x) { + MatMulND(ctx, y_temp, dout_temp, &dx_temp, transpose_y, true, alpha); + } else { + MatMulND(ctx, dout_temp, y_temp, &dx_temp, false, !transpose_y, + alpha); + } + + if (x_dims != x_bcast_dims) { + ReduceDims(ctx, x_dims, 
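// A standalone sketch of the 2-D gradient rule applied above for the plain
// (non-transposed) case C = A * B: dA = dC * B^T and dB = A^T * dC. Naive
// row-major loops over std::vector buffers, illustration only; the kernel
// above additionally selects the transpose_X / transpose_Y variants and, for
// batched inputs, sum-reduces the broadcast result back with ReduceDims.
#include <vector>

using Mat = std::vector<float>;  // row-major buffer

inline void MatMulGradNoTrans(const Mat& A, const Mat& B, const Mat& dC,
                              int M, int K, int N, Mat* dA, Mat* dB) {
  dA->assign(M * K, 0.f);
  dB->assign(K * N, 0.f);
  for (int m = 0; m < M; ++m) {
    for (int k = 0; k < K; ++k) {
      for (int n = 0; n < N; ++n) {
        (*dA)[m * K + k] += dC[m * N + n] * B[k * N + n];  // dC * B^T
        (*dB)[k * N + n] += A[m * K + k] * dC[m * N + n];  // A^T * dC
      }
    }
  }
}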
x_bcast_dims, dx_temp, dX); + } + } + + if (dY) { + Tensor dy_temp(Y->type()); + if (y_dims != y_bcast_dims) { + dy_temp.Resize(phi::make_ddim(y_bcast_dims)); + } else { + dY->mutable_data(ctx.GetPlace()); + dy_temp.ShareDataWith(*dY); + } + + if (transpose_y) { + MatMulND(ctx, dout_temp, x_temp, &dy_temp, true, transpose_x, alpha); + } else { + MatMulND(ctx, x_temp, dout_temp, &dy_temp, !transpose_x, false, + alpha); + } + + if (y_dims != y_bcast_dims) { + ReduceDims(ctx, y_dims, y_bcast_dims, dy_temp, dY); + } + } + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_MLU_KERNEL(matmul, ops::MatMulMLUKernel, + ops::MatMulMLUKernel); +REGISTER_OP_MLU_KERNEL(matmul_grad, ops::MatMulGradMLUKernel, + ops::MatMulGradMLUKernel); diff --git a/paddle/fluid/operators/matmul_v2_op.cc b/paddle/fluid/operators/matmul_v2_op.cc index 788dbb2204109dd4f215730e4234e3fec8aef702..01fa01e3c6ed04c151f709dd5fbebe387c32bde3 100644 --- a/paddle/fluid/operators/matmul_v2_op.cc +++ b/paddle/fluid/operators/matmul_v2_op.cc @@ -524,8 +524,8 @@ REGISTER_OPERATOR(matmul_v2, ops::MatMulV2Op, ops::MatMulV2OpMaker, ops::MatMulV2GradOpMaker, ops::MatMulV2GradOpMaker); -DELCARE_INFER_SHAPE_FUNCTOR(matmul_v2_grad, MatMulV2GradInferShapeFunctor, - PT_INFER_META(phi::GeneralBinaryGradInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(matmul_v2_grad, MatMulV2GradInferShapeFunctor, + PD_INFER_META(phi::GeneralBinaryGradInferMeta)); REGISTER_OPERATOR(matmul_v2_grad, ops::MatMulV2OpGrad, ops::MatMulV2OpDoubleGradMaker, ops::MatMulV2OpDoubleGradMaker, diff --git a/paddle/fluid/operators/matmul_v2_op_xpu.cc b/paddle/fluid/operators/matmul_v2_op_xpu.cc index 1524a50f1ac6d6afa67722bc5d1c16a581395bb2..87df75ac465042a0f7894abecb4be4c213e5d807 100644 --- a/paddle/fluid/operators/matmul_v2_op_xpu.cc +++ b/paddle/fluid/operators/matmul_v2_op_xpu.cc @@ -38,7 +38,7 @@ static void MatMulXPUFunction(const Tensor* x, const Tensor* y, Tensor* out, auto mat_dim_b = phi::funcs::CreateMatrixDescriptor( ColumnMatrixFromVector(y_dims), 0, trans_y); - if (x_dims.size() == 3 && y_dims.size() <= 2) { + if (x_dims.size() >= 3 && y_dims.size() <= 2) { // if transpose_X is true, the transpose cost much time if (!trans_x) { mat_dim_a.height_ *= mat_dim_a.batch_size_; diff --git a/paddle/fluid/operators/matrix_power_op.cc b/paddle/fluid/operators/matrix_power_op.cc index c65af3129f3646163925be95b27b9fec25207f8c..cdf204628b638f877c92e35a8941487aa39b5427 100644 --- a/paddle/fluid/operators/matrix_power_op.cc +++ b/paddle/fluid/operators/matrix_power_op.cc @@ -12,7 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
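// A standalone sketch of the reshaping trick behind the matmul_v2 XPU change
// above (x_dims.size() >= 3 && y_dims.size() <= 2): when Y is a plain [K, N]
// matrix and X is not transposed, a batched [B, M, K] x [K, N] product equals
// a single [(B*M), K] x [K, N] product, so the batch axis can be folded into
// the row dimension. MatDesc is a simplified stand-in for the real matrix
// descriptor type; illustration only.
struct MatDesc {
  int batch_size;
  int height;  // rows of each matrix in the batch
  int width;   // columns of each matrix in the batch
};

inline MatDesc FoldBatchIntoRows(MatDesc a) {
  a.height *= a.batch_size;  // [B, M, K] viewed as [(B*M), K]
  a.batch_size = 0;          // treat the operand as unbatched from here on
  return a;
}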
-#include "paddle/fluid/operators/matrix_power_op.h" +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor_util.h" namespace paddle { namespace operators { @@ -119,13 +122,3 @@ REGISTER_OPERATOR(matrix_power, ops::MatrixPowerOp, ops::MatrixPowerOpMaker, ops::MatrixPowerGradOpMaker); REGISTER_OPERATOR(matrix_power_grad, ops::MatrixPowerGradOp); - -REGISTER_OP_CPU_KERNEL( - matrix_power, - ops::MatrixPowerKernel, - ops::MatrixPowerKernel); - -REGISTER_OP_CPU_KERNEL( - matrix_power_grad, - ops::MatrixPowerGradKernel, - ops::MatrixPowerGradKernel); diff --git a/paddle/fluid/operators/matrix_power_op.h b/paddle/fluid/operators/matrix_power_op.h deleted file mode 100644 index d2c67d80b4f5a562d47e56173ecf1ea2f99bff56..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/matrix_power_op.h +++ /dev/null @@ -1,277 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/math/matrix_inverse.h" -#include "paddle/fluid/platform/for_range.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -struct IdentityMatrixFunctor { - IdentityMatrixFunctor(const int m, T* output) : m_(m), output_(output) {} - - HOSTDEVICE void operator()(size_t index) const { - const int row = index / m_ % m_; - const int col = index % m_; - output_[index] = col == row ? static_cast(1) : static_cast(0); - } - - const int m_; - T* output_; -}; - -template -void MatrixPowerFunction(const Tensor* X, const int n, Tensor* Out, - const paddle::framework::ExecutionContext& ctx) { - const auto& x_dims = X->dims(); - const int x_ndim = x_dims.size(); - T* out_data = Out->mutable_data(ctx.GetPlace()); - - auto& dev_ctx = ctx.template device_context(); - platform::ForRange for_range(dev_ctx, X->numel()); - - if (n == 0) { - // Out = Identity Matrix - IdentityMatrixFunctor functor(x_dims[x_ndim - 1], out_data); - for_range(functor); - return; - } - - auto blas = phi::funcs::GetBlas(dev_ctx); - - Tensor new_x = ctx.AllocateTmpTensor(X->dims(), dev_ctx); - int new_n = n; - if (n > 0) { - // newX = X - framework::TensorCopy(*X, ctx.GetPlace(), dev_ctx, &new_x); - } else { - // newX = X^{-1}, n = -n - math::MatrixInverseFunctor mat_inv; - mat_inv(dev_ctx, *X, &new_x); - new_n = -n; - } - - if (new_n == 1) { - framework::TensorCopy(new_x, ctx.GetPlace(), dev_ctx, Out); - return; - } - - auto no_trans_desc = phi::funcs::CreateMatrixDescriptor(x_dims, 0, false); - - if (new_n == 2) { - // Out = newX * newX - Out->mutable_data(ctx.GetPlace()); - blas.MatMul(new_x, no_trans_desc, new_x, no_trans_desc, static_cast(1), - Out, static_cast(0)); - return; - } else if (new_n == 3) { - // Out = (newX * newX) * newX - // Note: C[i] matrices in MatMul must not overlap, i.e. 
the individual - // gemm operations must be computable independently; otherwise, - // undefined behavior is expected. - Tensor temp = ctx.AllocateTmpTensor(X->dims(), dev_ctx); - blas.MatMul(new_x, no_trans_desc, new_x, no_trans_desc, static_cast(1), - &temp, static_cast(0)); - blas.MatMul(temp, no_trans_desc, new_x, no_trans_desc, static_cast(1), - Out, static_cast(0)); - return; - } else if (new_n == 4) { - // Out = (newX * newX) * (newX * newX) - Tensor temp = ctx.AllocateTmpTensor(X->dims(), dev_ctx); - blas.MatMul(new_x, no_trans_desc, new_x, no_trans_desc, static_cast(1), - &temp, static_cast(0)); - blas.MatMul(temp, no_trans_desc, temp, no_trans_desc, static_cast(1), - Out, static_cast(0)); - return; - } - - // Calculate Out = newX^{n} for abs(n) > 4 with time complexity as O(logN) - int bit = 0; - Tensor z = Tensor(X->dtype()); - bool out_inited = false; - Tensor temp_out = ctx.AllocateTmpTensor(X->dims(), dev_ctx); - Tensor temp_z = ctx.AllocateTmpTensor(X->dims(), dev_ctx); - while (new_n > 0) { - bit = new_n & 0x1; - new_n >>= 1; - if (z.IsInitialized()) { - blas.MatMul(z, no_trans_desc, z, no_trans_desc, static_cast(1), - &temp_z, static_cast(0)); - framework::TensorCopy(temp_z, ctx.GetPlace(), dev_ctx, &z); - } else { - z = ctx.AllocateTmpTensor(X->dims(), dev_ctx); - framework::TensorCopy(new_x, ctx.GetPlace(), dev_ctx, &z); - } - if (bit == 1) { - if (out_inited == true) { - blas.MatMul(*Out, no_trans_desc, z, no_trans_desc, static_cast(1), - &temp_out, static_cast(0)); - framework::TensorCopy(temp_out, ctx.GetPlace(), dev_ctx, Out); - } else { - framework::TensorCopy(z, ctx.GetPlace(), dev_ctx, Out); - out_inited = true; - } - } - } - return; -} - -template -class MatrixPowerKernel : public framework::OpKernel { - public: - void Compute(const paddle::framework::ExecutionContext& ctx) const override { - const Tensor* X = ctx.Input("X"); - Tensor* Out = ctx.Output("Out"); - int n = ctx.Attr("n"); - - const auto& x_dims = X->dims(); - const int x_ndim = x_dims.size(); - PADDLE_ENFORCE_EQ( - x_dims[x_ndim - 2], x_dims[x_ndim - 1], - platform::errors::InvalidArgument( - "The inner-most 2 dimensions of Input(X) should be equal." 
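// A standalone scalar sketch of the binary-exponentiation loop used above to
// evaluate newX^{n} in O(log n) multiplies: square a running power `z` each
// iteration and multiply it into the result whenever the corresponding bit
// of n is set. The kernel above applies the same scheme with matrix products.
#include <cstdint>

inline double PowBySquaring(double x, uint32_t n) {
  double result = 1.0;
  double z = x;  // z holds x^(2^i) at iteration i
  while (n > 0) {
    if (n & 0x1) result *= z;  // bit set: fold this power into the result
    z *= z;
    n >>= 1;
  }
  return result;
}
// PowBySquaring(3.0, 5) == 243.0, using two result multiplies and three
// squarings instead of four sequential multiplies.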
- "X's shape[-2] = %d and shape[-1] = %d.", - x_dims[x_ndim - 2], x_dims[x_ndim - 1])); - - MatrixPowerFunction(X, n, Out, ctx); - } -}; - -template -void MatrixPowerGradFunction(const Tensor* X, const Tensor* Out, - const Tensor* dOut, const int n, Tensor* dX, - const paddle::framework::ExecutionContext& ctx) { - dX->mutable_data(ctx.GetPlace()); - const auto& x_dims = X->dims(); - - auto& dev_ctx = ctx.template device_context(); - auto blas = phi::funcs::GetBlas(dev_ctx); - - if (n == 0) { - // \nabla X = O - phi::funcs::SetConstant zero; - zero(dev_ctx, dX, static_cast(0)); - return; - } else if (n == 1) { - // \nabla X = \nabla Out - framework::TensorCopy(*dOut, ctx.GetPlace(), dev_ctx, dX); - return; - } - - auto trans_desc = phi::funcs::CreateMatrixDescriptor(x_dims, 0, true); - auto no_trans_desc = phi::funcs::CreateMatrixDescriptor(x_dims, 0, false); - - if (n == -1) { - // \nabla X = Out^{T} * \nabla Out * Out^{T} - Tensor temp_dx = - ctx.AllocateTmpTensor(X->dims(), dev_ctx); - blas.MatMul(*Out, trans_desc, *dOut, no_trans_desc, static_cast(-1), - &temp_dx, static_cast(0)); - blas.MatMul(temp_dx, no_trans_desc, *Out, trans_desc, static_cast(1), dX, - static_cast(0)); - return; - } - - Tensor new_x = ctx.AllocateTmpTensor(X->dims(), dev_ctx); - int new_n = n; - if (n > 0) { - // newX = X - framework::TensorCopy(*X, ctx.GetPlace(), dev_ctx, &new_x); - } else { - // newX = X^{-1}, n = -n - math::MatrixInverseFunctor mat_inv; - mat_inv(dev_ctx, *X, &new_x); - new_n = -n; - } - - // Use chain rule blow to compute \nabla newX^{n} - // First, Get newX^{0}, newX^{1}, ..., newX^{n - 1}, - // Note that newX^{0} can be omitted - std::vector> tensor_list(new_n - 1); - tensor_list[0] = std::make_shared(new_x); - int index = 1; - while (index < new_n - 1) { - tensor_list[index] = std::make_shared( - ctx.AllocateTmpTensor(X->dims(), dev_ctx)); - blas.MatMul(*tensor_list[index - 1], no_trans_desc, new_x, no_trans_desc, - static_cast(1), tensor_list[index].get(), static_cast(0)); - index++; - } - - // Second, \nabla newX = \sum_{i = 0}^{n - 1} (newX^{T}^{i} - // * \nabla Out - // * (newX^{T}^{n - i - 1}) - Tensor dx_new = ctx.AllocateTmpTensor(X->dims(), dev_ctx); - blas.MatMul(*tensor_list[new_n - 2], trans_desc, *dOut, no_trans_desc, - static_cast(1), &dx_new, static_cast(0)); - Tensor da_an_minus1 = - ctx.AllocateTmpTensor(X->dims(), dev_ctx); - blas.MatMul(*dOut, no_trans_desc, *tensor_list[new_n - 2], trans_desc, - static_cast(1), &da_an_minus1, static_cast(0)); - blas.AXPY(X->numel(), static_cast(1), da_an_minus1.data(), - dx_new.data()); - int start = 0; - while (start < new_n - 2) { - Tensor a_da = ctx.AllocateTmpTensor(X->dims(), dev_ctx); - Tensor a_da_a = ctx.AllocateTmpTensor(X->dims(), dev_ctx); - blas.MatMul(*tensor_list[start], trans_desc, *dOut, no_trans_desc, - static_cast(1), &a_da, static_cast(0)); - blas.MatMul(a_da, no_trans_desc, *tensor_list[new_n - 3 - start], - trans_desc, static_cast(1), &a_da_a, static_cast(0)); - blas.AXPY(X->numel(), static_cast(1), a_da_a.data(), - dx_new.data()); - start++; - } - - if (n > 0) { - // \nabla X = \nabla newX - framework::TensorCopy(dx_new, ctx.GetPlace(), dev_ctx, dX); - } else { - // \nabla X = newX^{T} * \nabla newX * newX^{T} - Tensor temp_dx = - ctx.AllocateTmpTensor(X->dims(), dev_ctx); - blas.MatMul(new_x, trans_desc, dx_new, no_trans_desc, static_cast(-1), - &temp_dx, static_cast(0)); - blas.MatMul(temp_dx, no_trans_desc, new_x, trans_desc, static_cast(1), - dX, static_cast(0)); - } - return; -} - -template -class 
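// A standalone scalar sketch of the chain rule the gradient function above
// implements for Out = X^n (n > 0):
//   dX = sum_{i=0}^{n-1} (X^T)^i * dOut * (X^T)^{n-1-i}.
// For 1x1 "matrices" this collapses to the familiar n * x^(n-1) * dOut,
// which the loop below reproduces. Illustration only.
#include <cmath>

inline double MatrixPowerGradScalar(double x, double dout, int n) {
  double dx = 0.0;
  for (int i = 0; i < n; ++i) {
    dx += std::pow(x, i) * dout * std::pow(x, n - 1 - i);
  }
  return dx;  // equals n * pow(x, n - 1) * dout
}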
MatrixPowerGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const Tensor* X = ctx.Input("X"); - const Tensor* Out = ctx.Input("Out"); - const Tensor* dOut = ctx.Input(framework::GradVarName("Out")); - const int n = ctx.Attr("n"); - Tensor* dX = ctx.Output(framework::GradVarName("X")); - - MatrixPowerGradFunction(X, Out, dOut, n, dX, ctx); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/matrix_rank_op.cc b/paddle/fluid/operators/matrix_rank_op.cc index 65599259e2237387ad0dd85b5a9772733e3d7a1a..1f04875c2203b2af80aa3cb81aaf95fbb0a6fe6c 100644 --- a/paddle/fluid/operators/matrix_rank_op.cc +++ b/paddle/fluid/operators/matrix_rank_op.cc @@ -17,6 +17,7 @@ #include #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/operators/svd_helper.h" +#include "paddle/phi/kernels/funcs/compare_functors.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" @@ -224,15 +225,15 @@ class MatrixRankCPUKernel : public framework::OpKernel { int axis = -1; if (eigenvalue_tensor.dims().size() >= tol_tensor.dims().size()) { - ElementwiseComputeEx, + ElementwiseComputeEx, platform::CPUDeviceContext, T, int>( context, &eigenvalue_tensor, &tol_tensor, axis, - GreaterThanFunctor(), &compare_result); + phi::funcs::GreaterThanFunctor(), &compare_result); } else { - ElementwiseComputeEx, + ElementwiseComputeEx, platform::CPUDeviceContext, T, int>( context, &eigenvalue_tensor, &tol_tensor, axis, - LessThanFunctor(), &compare_result); + phi::funcs::LessThanFunctor(), &compare_result); } auto dito_int = math::DeviceIndependenceTensorOperations { compare_result.mutable_data(detail::NewAxisDim(dim_out, k), context.GetPlace()); int axis = -1; - ElementwiseComputeEx, + ElementwiseComputeEx, platform::CUDADeviceContext, T, int64_t>( context, &eigenvalue_tensor, &tol_tensor, axis, - GreaterThanFunctor(), &compare_result); + phi::funcs::GreaterThanFunctor(), &compare_result); auto dito_int = math::DeviceIndependenceTensorOperations(context); diff --git a/paddle/fluid/operators/matrix_rank_op.h b/paddle/fluid/operators/matrix_rank_op.h index 80774aa916920dd5c828498f4345bd85ea4f33f8..93545fd31037ada823d35af5b5bad809ebf3d773 100644 --- a/paddle/fluid/operators/matrix_rank_op.h +++ b/paddle/fluid/operators/matrix_rank_op.h @@ -15,7 +15,6 @@ #pragma once #include #include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/controlflow/compare_op.h" #include "paddle/phi/core/ddim.h" namespace paddle { diff --git a/paddle/fluid/operators/maxout_op.cc b/paddle/fluid/operators/maxout_op.cc index bd9ebd29777def2fafca648ad80bc57bef8df316..e55369e0691ee5e36da76c53c6dd5d13288231f4 100644 --- a/paddle/fluid/operators/maxout_op.cc +++ b/paddle/fluid/operators/maxout_op.cc @@ -12,14 +12,14 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#include "paddle/fluid/operators/maxout_op.h" #include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" + namespace paddle { namespace operators { -using framework::Tensor; - class MaxOutOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -130,10 +130,3 @@ REGISTER_OPERATOR( paddle::framework::DefaultGradOpMaker, paddle::framework::DefaultGradOpMaker); REGISTER_OPERATOR(maxout_grad, ops::MaxOutOpGrad); -REGISTER_OP_CPU_KERNEL( - maxout, ops::MaxOutKernel, - ops::MaxOutKernel); -REGISTER_OP_CPU_KERNEL( - maxout_grad, - ops::MaxOutGradKernel, - ops::MaxOutGradKernel); diff --git a/paddle/fluid/operators/maxout_op.cu.cc b/paddle/fluid/operators/maxout_op.cu.cc deleted file mode 100644 index be1e81bb869a3a5144b72ef54af22f75b2146bc5..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/maxout_op.cu.cc +++ /dev/null @@ -1,24 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/maxout_op.h" - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - maxout, ops::MaxOutKernel, - ops::MaxOutKernel); -REGISTER_OP_CUDA_KERNEL( - maxout_grad, - ops::MaxOutGradKernel, - ops::MaxOutGradKernel); diff --git a/paddle/fluid/operators/maxout_op.h b/paddle/fluid/operators/maxout_op.h deleted file mode 100644 index 922998293943ed5ee1ebcd08b5bcd93467496cb9..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/maxout_op.h +++ /dev/null @@ -1,72 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/maxouting.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class MaxOutKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* in_x = context.Input("X"); - Tensor* out = context.Output("Out"); - int groups = context.template Attr("groups"); - int axis = context.template Attr("axis"); - if (axis < 0) { - axis += in_x->dims().size(); - } - - math::MaxOutFunctor maxout_forward; - maxout_forward(context.template device_context(), *in_x, out, - groups, axis); - } -}; - -template -class MaxOutGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* in_x = context.Input("X"); - const Tensor* out = context.Input("Out"); - const Tensor* out_grad = - context.Input(framework::GradVarName("Out")); - Tensor* in_x_grad = context.Output(framework::GradVarName("X")); - int groups = context.template Attr("groups"); - int axis = context.template Attr("axis"); - if (axis < 0) { - axis += in_x->dims().size(); - } - - auto& device_ctx = context.template device_context(); - phi::funcs::SetConstant zero; - if (in_x_grad) { - in_x_grad->mutable_data(context.GetPlace()); - zero(device_ctx, in_x_grad, static_cast(0.0)); - math::MaxOutGradFunctor maxout_backward; - maxout_backward(device_ctx, *in_x, in_x_grad, *out, *out_grad, groups, - axis); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/metrics/accuracy_op.cc b/paddle/fluid/operators/metrics/accuracy_op.cc index 3692ace8bb5a46b06bd10a07a5d5d95d8825bdc6..32ef052119883944abc1876f8bf3a8c028ddc57a 100644 --- a/paddle/fluid/operators/metrics/accuracy_op.cc +++ b/paddle/fluid/operators/metrics/accuracy_op.cc @@ -12,7 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
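// A standalone sketch of the maxout operation whose CPU/GPU kernels are
// removed from this file: with `groups` g, each output channel is the
// maximum of g input channels, assuming the usual convention that the g
// channels of a group are consecutive. Simplified single-spatial-position
// version over std::vector, illustration only.
#include <algorithm>
#include <vector>

inline std::vector<float> MaxOut1D(const std::vector<float>& in, int groups) {
  const int out_channels = static_cast<int>(in.size()) / groups;
  std::vector<float> out(out_channels);
  for (int o = 0; o < out_channels; ++o) {
    float best = in[o * groups];
    for (int g = 1; g < groups; ++g) {
      best = std::max(best, in[o * groups + g]);
    }
    out[o] = best;
  }
  return out;
}
// The backward pass routes the upstream gradient only to the input element
// that attained the maximum in each group, which is what MaxOutGradFunctor
// does using the saved forward output.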
*/ -#include "paddle/fluid/operators/metrics/accuracy_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/ternary.h" namespace paddle { namespace operators { @@ -21,69 +23,6 @@ class AccuracyOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ( - ctx->HasInput("Out"), true, - platform::errors::NotFound("Input (Out) of AccuracyOp is not found.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("Indices"), true, - platform::errors::NotFound( - "Input (Indices) of AccuracyOp is not found.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("Label"), true, - platform::errors::NotFound( - "Input (Label) of AccuracyOp is not found.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Accuracy"), true, - platform::errors::NotFound( - "Output (Accuracy) of AccuracyOp is not found.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Correct"), true, - platform::errors::NotFound( - "Output (Correct) of AccuracyOp is not found.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Total"), true, - platform::errors::NotFound( - "Output (Total) of AccuracyOp is not found.")); - - OP_INOUT_CHECK(ctx->HasInput("Out"), "Input", "Out", "Accuracy"); - OP_INOUT_CHECK(ctx->HasInput("Indices"), "Input", "Indices", "Accuracy"); - OP_INOUT_CHECK(ctx->HasInput("Label"), "Input", "Label", "Accuracy"); - OP_INOUT_CHECK(ctx->HasOutput("Accuracy"), "Output", "Accuracy", - "Accuracy"); - OP_INOUT_CHECK(ctx->HasOutput("Correct"), "Output", "Correct", "Accuracy"); - OP_INOUT_CHECK(ctx->HasOutput("Total"), "Output", "Total", "Accuracy"); - - auto inference_dim = ctx->GetInputDim("Out"); - auto label_dim = ctx->GetInputDim("Label"); - // Assume indices has same shape as inference, because - // it's the output of topk. - - PADDLE_ENFORCE_EQ( - label_dim.size(), 2, - platform::errors::InvalidArgument( - "ShapeError: label's dimensions of AccuracyOp must be 2. " - "But received label's dimensions = %d, label's shape = [%s]", - label_dim.size(), label_dim)); - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ(label_dim[1], 1, - platform::errors::InvalidArgument( - "ShapeError: label's second dimension of " - "AccuracyOp must be 1. But received label's " - "second dimension is = %d, label's shape = [%s]", - label_dim[1], label_dim)); - PADDLE_ENFORCE_EQ( - inference_dim[0], label_dim[0], - platform::errors::InvalidArgument( - "ShapeError: the output's num_rows of AccuracyOp must be" - " the same as label's num_rows. But received output's " - "shape = [%s], label's shape = [%s], output's num_rows = %d, " - "label's " - "num_rows = %d", - inference_dim, label_dim, inference_dim[0], label_dim[0])); - } - - ctx->SetOutputDim("Accuracy", {1}); - ctx->SetOutputDim("Correct", {1}); - ctx->SetOutputDim("Total", {1}); - ctx->ShareLoD("Out", /*->*/ "Accuracy"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -123,13 +62,13 @@ with the input Out(Inference). } // namespace operators } // namespace paddle +// FIXME(typhoonzero): types of T is for infernece data. +// label data is always int. 
+DECLARE_INFER_SHAPE_FUNCTOR(accuracy, AccuracyInferShapeFunctor, + PD_INFER_META(phi::AccuracyInferMeta)); namespace ops = paddle::operators; REGISTER_OPERATOR( accuracy, ops::AccuracyOp, ops::AccuracyOpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); -// FIXME(typhoonzero): types of T is for infernece data. -// label data is always int. -REGISTER_OP_CPU_KERNEL(accuracy, - ops::AccuracyKernel, - ops::AccuracyKernel); + paddle::framework::EmptyGradOpMaker, + AccuracyInferShapeFunctor); diff --git a/paddle/fluid/operators/metrics/accuracy_op.cu b/paddle/fluid/operators/metrics/accuracy_op.cu deleted file mode 100644 index 6f19100fa9d37e2efedad60a982bf19b09cac736..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/metrics/accuracy_op.cu +++ /dev/null @@ -1,110 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include "paddle/fluid/operators/metrics/accuracy_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" -#include "paddle/fluid/platform/float16.h" - -namespace paddle { -namespace operators { -using platform::PADDLE_CUDA_NUM_THREADS; - -template -__global__ void AccuracyCudaKernel(const int N, const int D, - const int64_t* Xdata, - const int64_t* labeldata, int* correct_data, - float* accuracy, int* total_data) { - int count = 0; - __shared__ int total[BlockSize]; - - // support only 1 block - for (int i = threadIdx.x; i < (N); i += BlockSize) { - for (int j = 0; j < D; ++j) { - if (Xdata[i * D + j] == labeldata[i]) { - ++count; - break; - } - } - } - total[threadIdx.x] = count; - __syncthreads(); - -// reduce the count with init value 0, and output accuracy. -#ifdef PADDLE_WITH_CUDA - int result = thrust::reduce(thrust::device, total, total + BlockSize, 0); -#else - // HIP thrust::reduce not support __device__ - for (int s = BlockSize / 2; s > 0; s >>= 1) { - if (threadIdx.x < s) { - total[threadIdx.x] += total[threadIdx.x + s]; - } - __syncthreads(); - } - int result = total[0]; -#endif - if (threadIdx.x == 0) { - *correct_data = result; - *accuracy = static_cast(result) / static_cast(N); - *total_data = N; - } -} - -template -class AccuracyOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* inference = ctx.Input("Out"); - auto* indices = ctx.Input("Indices"); - auto* label = ctx.Input("Label"); - - auto* accuracy = ctx.Output("Accuracy"); - auto* correct = ctx.Output("Correct"); - auto* total = ctx.Output("Total"); - // FIXME(typhoonzero): only support indices currently - // if add support for output values, how to detect the data type? 
- const int64_t* indices_data = indices->data(); - const int64_t* label_data = label->data(); - - int* correct_data = correct->mutable_data(ctx.GetPlace()); - int* total_data = total->mutable_data(ctx.GetPlace()); - float* accuracy_data = accuracy->mutable_data(ctx.GetPlace()); - - int num_samples = static_cast(inference->dims()[0]); - size_t infer_width = inference->dims()[1]; - auto stream = ctx.cuda_device_context().stream(); - platform::GpuMemsetAsync(accuracy_data, 0, sizeof(float), stream); - - if (num_samples == 0) { - return; - } - - AccuracyCudaKernel< - PADDLE_CUDA_NUM_THREADS><<<1, PADDLE_CUDA_NUM_THREADS, 0, stream>>>( - num_samples, infer_width, indices_data, label_data, correct_data, - accuracy_data, total_data); - } -}; - -} // namespace operators -} // namespace paddle - -// FIXME(typhoonzero): types of T is for inference data. -// label data is always int64 -REGISTER_OP_CUDA_KERNEL( - accuracy, paddle::operators::AccuracyOpCUDAKernel, - paddle::operators::AccuracyOpCUDAKernel, - paddle::operators::AccuracyOpCUDAKernel); diff --git a/paddle/fluid/operators/metrics/accuracy_op.h b/paddle/fluid/operators/metrics/accuracy_op.h deleted file mode 100644 index 94e5bf8257e67b9fd01aa9ae45a25d90963fef13..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/metrics/accuracy_op.h +++ /dev/null @@ -1,74 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class AccuracyKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* inference = ctx.Input("Out"); - auto* indices = ctx.Input("Indices"); - auto* label = ctx.Input("Label"); - auto* accuracy = ctx.Output("Accuracy"); - auto* correct = ctx.Output("Correct"); - auto* total = ctx.Output("Total"); - - int* correct_data = correct->mutable_data(ctx.GetPlace()); - int* total_data = total->mutable_data(ctx.GetPlace()); - float* accuracy_data = accuracy->mutable_data(ctx.GetPlace()); - - const int64_t* indices_data = indices->data(); - const int64_t* label_data = label->data(); - - size_t num_samples = inference->dims()[0]; - size_t class_dim = inference->dims()[1]; - *accuracy_data = 0.0f; - - if (num_samples == 0) { - return; - } - - int num_correct = 0; - // assume inference is already the topk of the output - for (size_t i = 0; i < num_samples; ++i) { - PADDLE_ENFORCE_GE( - label_data[i], 0, - platform::errors::InvalidArgument( - "label of AccuracyOp must >= 0, But received label[%d] is %d", i, - label_data[i])); - for (size_t j = 0; j < class_dim; ++j) { - if (indices_data[i * class_dim + j] == label_data[i]) { - ++num_correct; - break; - } - } - } - - *correct_data = num_correct; - *total_data = num_samples; - *accuracy_data = - static_cast(num_correct) / static_cast(num_samples); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/metrics/accuracy_op_mlu.cc b/paddle/fluid/operators/metrics/accuracy_op_mlu.cc index 2598d3b0277c94a52e1fa14b04c00b595071f312..1ce02ff4525c9692f88ed42b79ff336cc0113c41 100644 --- a/paddle/fluid/operators/metrics/accuracy_op_mlu.cc +++ b/paddle/fluid/operators/metrics/accuracy_op_mlu.cc @@ -12,7 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/metrics/accuracy_op.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/operators/mlu/mlu_baseop.h" namespace paddle { diff --git a/paddle/fluid/operators/metrics/accuracy_op_npu.cc b/paddle/fluid/operators/metrics/accuracy_op_npu.cc index 63bccc2e6e065a639c86a647894d2a0c124f0e54..9f2ca4165f33a28902bfe20207b12bad2af49fad 100644 --- a/paddle/fluid/operators/metrics/accuracy_op_npu.cc +++ b/paddle/fluid/operators/metrics/accuracy_op_npu.cc @@ -12,8 +12,8 @@ limitations under the License. */ #include #include -#include "paddle/fluid/operators/controlflow/compare_op.h" -#include "paddle/fluid/operators/metrics/accuracy_op.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/metrics/accuracy_op_xpu.cc b/paddle/fluid/operators/metrics/accuracy_op_xpu.cc index de71312d78df99adc3b3663f2fcbb3943373982e..3cc1be4de8a82ff263824ab4852178f735596d45 100644 --- a/paddle/fluid/operators/metrics/accuracy_op_xpu.cc +++ b/paddle/fluid/operators/metrics/accuracy_op_xpu.cc @@ -14,12 +14,14 @@ limitations under the License. 
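// A standalone sketch of the accuracy metric whose CPU/CUDA kernels are
// deleted above: `indices` holds the top-k predicted class ids per sample,
// and a sample counts as correct if any of them matches its label. Plain
// std::vector version, illustration only.
#include <cstdint>
#include <vector>

inline float TopKAccuracy(const std::vector<int64_t>& indices,  // [N * K]
                          const std::vector<int64_t>& labels,   // [N]
                          int64_t k) {
  if (labels.empty()) return 0.f;
  int64_t correct = 0;
  for (size_t i = 0; i < labels.size(); ++i) {
    for (int64_t j = 0; j < k; ++j) {
      if (indices[i * k + j] == labels[i]) {
        ++correct;
        break;
      }
    }
  }
  return static_cast<float>(correct) / static_cast<float>(labels.size());
}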
*/ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/metrics/accuracy_op.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/device/xpu/xpu_header.h" namespace paddle { namespace operators { +using Tensor = paddle::framework::Tensor; template class AccuracyXPUKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/metrics/auc_op.cc b/paddle/fluid/operators/metrics/auc_op.cc index 2a3a0fa5d1fe50c93686c76571d812cab18c1d38..f3ed98c3f4d1e47a8b7dff81a998c7574859baa2 100644 --- a/paddle/fluid/operators/metrics/auc_op.cc +++ b/paddle/fluid/operators/metrics/auc_op.cc @@ -12,7 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/metrics/auc_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/multiary.h" namespace paddle { namespace operators { @@ -21,70 +24,6 @@ class AucOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - protected: - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Predict"), "Input", "Predict", "Auc"); - OP_INOUT_CHECK(ctx->HasInput("Label"), "Input", "Label", "Auc"); - auto predict_dims = ctx->GetInputDim("Predict"); - auto label_dims = ctx->GetInputDim("Label"); - PADDLE_ENFORCE_GE( - predict_dims.size(), 2, - platform::errors::InvalidArgument( - "The Input(Predict) has not been initialized properly. The " - "shape of Input(Predict) = [%s], the shape size must be " - "greater_equal 2.", - predict_dims)); - auto predict_width = predict_dims[1]; - PADDLE_ENFORCE_NE( - phi::product(predict_dims), 0, - platform::errors::InvalidArgument( - "The Input(Predict) has not been initialized properly. The " - "shape of Input(Predict) = [%s], the shape can not involes 0.", - predict_dims)); - PADDLE_ENFORCE_NE( - phi::product(label_dims), 0, - platform::errors::InvalidArgument( - "The Input(Label) has not been initialized properly. 
The " - "shape of Input(Label) = [%s], the shape can not involes 0.", - label_dims)); - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_LE(predict_width, 2, - platform::errors::InvalidArgument( - "Only support binary classification," - "prediction dims[1] should be 1 or 2")); - } - auto predict_height = ctx->GetInputDim("Predict")[0]; - auto label_height = ctx->GetInputDim("Label")[0]; - - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ(predict_height, label_height, - platform::errors::InvalidArgument( - "Out and Label should have same height.")); - } - - int num_pred_buckets = ctx->Attrs().Get("num_thresholds") + 1; - int slide_steps = ctx->Attrs().Get("slide_steps"); - - PADDLE_ENFORCE_GE( - num_pred_buckets, 1, - platform::errors::InvalidArgument("num_thresholds must larger than 1")); - PADDLE_ENFORCE_GE(slide_steps, 0, - platform::errors::InvalidArgument( - "slide_steps must be natural number")); - - ctx->SetOutputDim("AUC", {1}); - - if (slide_steps) { - ctx->SetOutputDim("StatPosOut", - {(1 + slide_steps) * num_pred_buckets + 1}); - ctx->SetOutputDim("StatNegOut", - {(1 + slide_steps) * num_pred_buckets + 1}); - } else { - ctx->SetOutputDim("StatPosOut", {1, num_pred_buckets}); - ctx->SetOutputDim("StatNegOut", {1, num_pred_buckets}); - } - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -145,5 +84,7 @@ There are two types of possible curves: } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_WITHOUT_GRADIENT(auc, ops::AucOp, ops::AucOpMaker); -REGISTER_OP_CPU_KERNEL(auc, ops::AucKernel); +DECLARE_INFER_SHAPE_FUNCTOR(auc, AucInferShapeFunctor, + PD_INFER_META(phi::AucInferMeta)); +REGISTER_OP_WITHOUT_GRADIENT(auc, ops::AucOp, ops::AucOpMaker, + AucInferShapeFunctor); diff --git a/paddle/fluid/operators/metrics/auc_op.cu b/paddle/fluid/operators/metrics/auc_op.cu deleted file mode 100644 index 1cb7eba8775e814b1150929de4a341c466ee4583..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/metrics/auc_op.cu +++ /dev/null @@ -1,232 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/metrics/auc_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -namespace paddle { -namespace operators { -using platform::PADDLE_CUDA_NUM_THREADS; -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; - -__global__ void ClearObsoleteDataKernel(int64_t *pos, int64_t *neg, - const int bucket_length, - const int slide_steps) { - int cur_step_index = - static_cast(pos[(slide_steps + 1) * bucket_length]) % slide_steps; - int cur_step_begin = cur_step_index * bucket_length; - int sum_step_begin = slide_steps * bucket_length; - CUDA_KERNEL_LOOP(i, bucket_length) { - pos[sum_step_begin + i] -= pos[cur_step_begin + i]; - neg[sum_step_begin + i] -= neg[cur_step_begin + i]; - pos[cur_step_begin + i] = neg[cur_step_begin + i] = 0; - } -} - -__global__ void UpdateSumDataKernel(int64_t *pos, int64_t *neg, - const int bucket_length, - const int slide_steps) { - int cur_step_index = - static_cast(pos[(slide_steps + 1) * bucket_length]) % slide_steps; - int cur_step_begin = cur_step_index * bucket_length; - int sum_step_begin = slide_steps * bucket_length; - CUDA_KERNEL_LOOP(i, bucket_length) { - pos[sum_step_begin + i] += pos[cur_step_begin + i]; - neg[sum_step_begin + i] += neg[cur_step_begin + i]; - } -} - -template -__global__ void AddDataKernel(const int64_t *label_data, const T *pred_data, - const int inference_width, - const int num_thresholds, int64_t *pos, - int64_t *neg, const int numel, - const int slide_steps) { - int cur_step_begin = 0; - if (slide_steps > 0) { - int cur_step_index = - static_cast(pos[(slide_steps + 1) * (1 + num_thresholds)]) % - slide_steps; - cur_step_begin = cur_step_index * (1 + num_thresholds); - } - CUDA_KERNEL_LOOP(i, numel) { - auto predict_data = pred_data[i * inference_width + (inference_width - 1)]; - PADDLE_ENFORCE(predict_data <= 1, "The predict data must less or equal 1."); - PADDLE_ENFORCE(predict_data >= 0, - "The predict data must gather or equal 0."); - uint32_t binIdx = static_cast(predict_data * num_thresholds); - if (label_data[i]) { - paddle::platform::CudaAtomicAdd(pos + cur_step_begin + binIdx, 1); - } else { - paddle::platform::CudaAtomicAdd(neg + cur_step_begin + binIdx, 1); - } - } -} -__global__ void CalcAucKernel(int64_t *stat_pos, int64_t *stat_neg, - int num_thresholds, double *auc, - bool need_add_batch_num) { - *auc = 0.0f; - double totPos = 0.0; - double totNeg = 0.0; - double totPosPrev = 0.0; - double totNegPrev = 0.0; - - int idx = num_thresholds; - - while (idx >= 0) { - totPosPrev = totPos; - totNegPrev = totNeg; - totPos += stat_pos[idx]; - totNeg += stat_neg[idx]; - *auc += (totNeg - totNegPrev) * (totPos + totPosPrev) / 2.0; - --idx; - } - - if (totPos > 0.0 && totNeg > 0.0) { - *auc = *auc / totPos / totNeg; - } - if (need_add_batch_num) { - stat_pos[num_thresholds + 1] += 1; - stat_neg[num_thresholds + 1] += 1; - } -} - -template -class AucCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *predict = ctx.Input("Predict"); - auto *label = ctx.Input("Label"); - - int num_thresholds = ctx.Attr("num_thresholds"); - int slide_steps = ctx.Attr("slide_steps"); - - // Only use output var for now, make sure it's persistable and - // not cleaned up for each batch. 
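// A standalone CPU sketch of the sliding-window bookkeeping performed by the
// deleted ClearObsoleteDataKernel / AddDataKernel / UpdateSumDataKernel
// above: the stat buffer holds `slide_steps` per-step histograms, one
// running-sum histogram, and a trailing step counter, laid out back to back.
// Simplified sequential model of the same update, illustration only.
#include <cstdint>
#include <vector>

inline void SlideWindowUpdate(std::vector<int64_t>* stat,
                              const std::vector<int64_t>& batch_hist,
                              int bucket_length, int slide_steps) {
  int64_t& step_counter = (*stat)[(slide_steps + 1) * bucket_length];
  const int cur = static_cast<int>(step_counter % slide_steps);
  const int cur_begin = cur * bucket_length;
  const int sum_begin = slide_steps * bucket_length;
  for (int i = 0; i < bucket_length; ++i) {
    // Drop the step that is about to be overwritten from the running sum.
    (*stat)[sum_begin + i] -= (*stat)[cur_begin + i];
    // Overwrite it with the current batch and fold it back into the sum.
    (*stat)[cur_begin + i] = batch_hist[i];
    (*stat)[sum_begin + i] += (*stat)[cur_begin + i];
  }
  ++step_counter;  // advance the ring-buffer position
}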
- auto *auc_tensor = ctx.Output("AUC"); - auto *stat_pos = ctx.Output("StatPosOut"); - auto *stat_neg = ctx.Output("StatNegOut"); - - auto *origin_stat_pos = stat_pos->mutable_data(ctx.GetPlace()); - auto *origin_stat_neg = stat_neg->mutable_data(ctx.GetPlace()); - auto *auc_value = auc_tensor->mutable_data(ctx.GetPlace()); - - auto *stat_pos_in_tensor = ctx.Input("StatPos"); - auto *pos_in_data = stat_pos_in_tensor->data(); - auto *stat_neg_in_tensor = ctx.Input("StatNeg"); - auto *neg_in_data = stat_neg_in_tensor->data(); -#ifdef PADDLE_WITH_CUDA - if (stat_pos_in_tensor != stat_pos) { - cudaMemcpy(origin_stat_pos, pos_in_data, - ((1 + slide_steps) * (num_thresholds + 1) + - (slide_steps > 0 ? 1 : 0)) * - sizeof(int64_t), - cudaMemcpyDeviceToDevice); - } - if (stat_neg_in_tensor != stat_neg) { - cudaMemcpy(origin_stat_neg, neg_in_data, - ((1 + slide_steps) * (num_thresholds + 1) + - (slide_steps > 0 ? 1 : 0)) * - sizeof(int64_t), - cudaMemcpyDeviceToDevice); - } -#else - if (stat_pos_in_tensor != stat_pos) { - hipMemcpy(origin_stat_pos, pos_in_data, - ((1 + slide_steps) * (num_thresholds + 1) + - (slide_steps > 0 ? 1 : 0)) * - sizeof(int64_t), - hipMemcpyDeviceToDevice); - } - if (stat_neg_in_tensor != stat_neg) { - hipMemcpy(origin_stat_neg, neg_in_data, - ((1 + slide_steps) * (num_thresholds + 1) + - (slide_steps > 0 ? 1 : 0)) * - sizeof(int64_t), - hipMemcpyDeviceToDevice); - } -#endif - - statAuc(ctx, label, predict, num_thresholds, slide_steps, origin_stat_pos, - origin_stat_neg); - int sum_offset = slide_steps * (num_thresholds + 1); - auto stream = - ctx.template device_context().stream(); - CalcAucKernel<<<1, 1, 0, stream>>>( - origin_stat_pos + sum_offset, origin_stat_neg + sum_offset, - num_thresholds, auc_value, slide_steps > 0); - } - - private: - inline static double trapezoidArea(double X1, double X2, double Y1, - double Y2) { - return (X1 > X2 ? 
(X1 - X2) : (X2 - X1)) * (Y1 + Y2) / 2.0; - } - - inline static void statAuc(const framework::ExecutionContext &ctx, - const framework::Tensor *label, - const framework::Tensor *predict, - const int num_thresholds, const int slide_steps, - int64_t *origin_stat_pos, - int64_t *origin_stat_neg) { - size_t batch_size = predict->dims()[0]; - size_t inference_width = predict->dims()[1]; - const T *inference_data = predict->data(); - const auto *label_data = label->data(); - const int bucket_length = num_thresholds + 1; - auto stream = - ctx.template device_context().stream(); - if (slide_steps == 0) { - AddDataKernel<<<(batch_size + PADDLE_CUDA_NUM_THREADS - 1) / - PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>( - label_data, inference_data, inference_width, num_thresholds, - origin_stat_pos, origin_stat_neg, batch_size, slide_steps); - return; - } - // the last number of origin_stat_pos store the index should be used in - // current step - int cur_step_index = - static_cast(origin_stat_pos[(slide_steps + 1) * bucket_length]) % - slide_steps; - int cur_step_begin = cur_step_index * bucket_length; - int sum_step_begin = slide_steps * bucket_length; - - ClearObsoleteDataKernel<<<(bucket_length + PADDLE_CUDA_NUM_THREADS - 1) / - PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>( - origin_stat_pos, origin_stat_neg, bucket_length, slide_steps); - - AddDataKernel<<<(batch_size + PADDLE_CUDA_NUM_THREADS - 1) / - PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>( - label_data, inference_data, inference_width, num_thresholds, - origin_stat_pos, origin_stat_neg, batch_size, slide_steps); - UpdateSumDataKernel<<<(bucket_length + PADDLE_CUDA_NUM_THREADS - 1) / - PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>( - origin_stat_pos, origin_stat_neg, bucket_length, slide_steps); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(auc, - ops::AucCUDAKernel); diff --git a/paddle/fluid/operators/metrics/auc_op.h b/paddle/fluid/operators/metrics/auc_op.h deleted file mode 100644 index 10403472c69b57723bc714703c115f07d8640f7e..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/metrics/auc_op.h +++ /dev/null @@ -1,186 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class AucKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *predict = ctx.Input("Predict"); - auto *label = ctx.Input("Label"); - - int num_thresholds = ctx.Attr("num_thresholds"); - int slide_steps = ctx.Attr("slide_steps"); - - // Only use output var for now, make sure it's persistable and - // not cleaned up for each batch. 
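For reference, the reduction that both CalcAucKernel above and the CPU calcAuc below implement: walk the histograms from the highest threshold down, accumulate positive and negative counts, sum trapezoids under the resulting (false positive, true positive) curve, and normalize by totPos * totNeg. A standalone host version of the same computation (illustrative name, same bucket layout assumed):

#include <cstdint>

static double BucketedAuc(const int64_t* stat_pos, const int64_t* stat_neg,
                          int num_thresholds) {
  double auc = 0.0, tot_pos = 0.0, tot_neg = 0.0;
  for (int idx = num_thresholds; idx >= 0; --idx) {
    const double prev_pos = tot_pos, prev_neg = tot_neg;
    tot_pos += stat_pos[idx];
    tot_neg += stat_neg[idx];
    // Trapezoid rule: width along the negative axis times the average height
    // along the positive axis.
    auc += (tot_neg - prev_neg) * (tot_pos + prev_pos) / 2.0;
  }
  if (tot_pos > 0.0 && tot_neg > 0.0) {
    auc = auc / tot_pos / tot_neg;
  }
  return auc;
}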
- auto *auc_tensor = ctx.Output("AUC"); - auto *stat_pos = ctx.Output("StatPosOut"); - auto *stat_neg = ctx.Output("StatNegOut"); - - auto *origin_stat_pos = stat_pos->mutable_data(ctx.GetPlace()); - auto *origin_stat_neg = stat_neg->mutable_data(ctx.GetPlace()); - auto *auc_value = auc_tensor->mutable_data(ctx.GetPlace()); - - // Just for pass UT, since UT's input & output connot be set same var - auto *stat_pos_in_tensor = ctx.Input("StatPos"); - auto *pos_in_data = stat_pos_in_tensor->data(); - auto *stat_neg_in_tensor = ctx.Input("StatNeg"); - auto *neg_in_data = stat_neg_in_tensor->data(); - if (stat_pos_in_tensor != stat_pos) { - memcpy(origin_stat_pos, pos_in_data, - ((1 + slide_steps) * (num_thresholds + 1) + - (slide_steps > 0 ? 1 : 0)) * - sizeof(int64_t)); - } - if (stat_neg_in_tensor != stat_neg) { - memcpy(origin_stat_neg, neg_in_data, - ((1 + slide_steps) * (num_thresholds + 1) + - (slide_steps > 0 ? 1 : 0)) * - sizeof(int64_t)); - } - statAuc(label, predict, num_thresholds, slide_steps, origin_stat_pos, - origin_stat_neg); - - int sum_offset = slide_steps * (num_thresholds + 1); - calcAuc(origin_stat_pos + sum_offset, origin_stat_neg + sum_offset, - num_thresholds, auc_value); - if (slide_steps) { - origin_stat_pos[(slide_steps + 1) * (num_thresholds + 1)] += 1; - origin_stat_neg[(slide_steps + 1) * (num_thresholds + 1)] += 1; - } - } - - private: - inline static double trapezoidArea(double X1, double X2, double Y1, - double Y2) { - return (X1 > X2 ? (X1 - X2) : (X2 - X1)) * (Y1 + Y2) / 2.0; - } - - inline static void statAuc(const framework::Tensor *label, - const framework::Tensor *predict, - const int num_thresholds, const int slide_steps, - int64_t *origin_stat_pos, - int64_t *origin_stat_neg) { - size_t batch_size = predict->dims()[0]; - size_t inference_width = predict->dims()[1]; - const T *inference_data = predict->data(); - const auto *label_data = label->data(); - const int bucket_length = num_thresholds + 1; - if (slide_steps == 0) { - for (size_t i = 0; i < batch_size; i++) { - // if predict_data[i] has dim of 2, then predict_data[i][1] is pos prob - // if predict_data[i] has dim of 1, then predict_data[i][0] is pos prob - auto predict_data = - inference_data[i * inference_width + (inference_width - 1)]; - PADDLE_ENFORCE_LE(predict_data, 1, - platform::errors::PreconditionNotMet( - "The predict data must less or equal 1.")); - PADDLE_ENFORCE_GE(predict_data, 0, - platform::errors::PreconditionNotMet( - "The predict data must gather or equal 0.")); - - uint32_t binIdx = static_cast(predict_data * num_thresholds); - if (label_data[i] > 0) { - origin_stat_pos[binIdx] += 1; - } else if (label_data[i] == 0) { - origin_stat_neg[binIdx] += 1; - } - } - return; - } - // the last number of origin_stat_pos store the index should be used in - // current step - int cur_step_index = - static_cast(origin_stat_pos[(slide_steps + 1) * bucket_length]) % - slide_steps; - int cur_step_begin = cur_step_index * bucket_length; - int sum_step_begin = slide_steps * bucket_length; - for (int i = 0; i < bucket_length; ++i) { - origin_stat_pos[sum_step_begin + i] -= - origin_stat_pos[cur_step_begin + i]; - origin_stat_neg[sum_step_begin + i] -= - origin_stat_neg[cur_step_begin + i]; - } - - std::memset(origin_stat_pos + cur_step_begin, 0, - bucket_length * sizeof(int64_t)); - std::memset(origin_stat_neg + cur_step_begin, 0, - bucket_length * sizeof(int64_t)); - - for (size_t i = 0; i < batch_size; i++) { - // if predict_data[i] has dim of 2, then predict_data[i][1] is pos prob - // if 
predict_data[i] has dim of 1, then predict_data[i][0] is pos prob - auto predict_data = - inference_data[i * inference_width + (inference_width - 1)]; - PADDLE_ENFORCE_LE(predict_data, 1, - platform::errors::PreconditionNotMet( - "The predict data must less or equal 1.")); - PADDLE_ENFORCE_GE(predict_data, 0, - platform::errors::PreconditionNotMet( - "The predict data must gather or equal 0.")); - - uint32_t binIdx = static_cast(predict_data * num_thresholds); - if (label_data[i] > 0) { - origin_stat_pos[cur_step_begin + binIdx] += 1; - } else if (label_data[i] == 0) { - origin_stat_neg[cur_step_begin + binIdx] += 1; - } - } - for (int i = 0; i < bucket_length; ++i) { - origin_stat_pos[sum_step_begin + i] += - origin_stat_pos[cur_step_begin + i]; - origin_stat_neg[sum_step_begin + i] += - origin_stat_neg[cur_step_begin + i]; - } - } - - inline static void calcAuc(const int64_t *stat_pos, const int64_t *stat_neg, - int num_thresholds, double *auc) { - *auc = 0.0f; - - double totPos = 0.0; - double totNeg = 0.0; - double totPosPrev = 0.0; - double totNegPrev = 0.0; - - int idx = num_thresholds; - - while (idx >= 0) { - totPosPrev = totPos; - totNegPrev = totNeg; - totPos += stat_pos[idx]; - totNeg += stat_neg[idx]; - *auc += trapezoidArea(totNeg, totNegPrev, totPos, totPosPrev); - --idx; - } - - if (totPos > 0.0 && totNeg > 0.0) { - *auc = *auc / totPos / totNeg; - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc index 90e6a36220ab04087cd02abd76f6c3598425573c..812c55cdd5055186d7fd83a2057d88256f3b34a3 100644 --- a/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc @@ -150,4 +150,5 @@ class LayerNormMKLDNNOpKernel : public paddle::framework::OpKernel { // TODO(jczaja): Enable FP32 when performance is good namespace ops = paddle::operators; REGISTER_OP_KERNEL(layer_norm, MKLDNN, ::paddle::platform::CPUPlace, + ops::LayerNormMKLDNNOpKernel, ops::LayerNormMKLDNNOpKernel); diff --git a/paddle/fluid/operators/mkldnn/shape_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/shape_mkldnn_op.cc index 780c6e7f153e7b1179e203bc7807dd7818aa591a..a3b764b0e1c46ab91b989ed7f7b0b5df101f7654 100644 --- a/paddle/fluid/operators/mkldnn/shape_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/shape_mkldnn_op.cc @@ -13,19 +13,32 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/shape_op.h" #include "paddle/fluid/platform/mkldnn_helper.h" namespace paddle { namespace operators { -using paddle::framework::Tensor; +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using SelectedRows = phi::SelectedRows; template -class ShapeMKLDNNKernel : public ShapeKernel { +class ShapeMKLDNNKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - ShapeKernel::Compute(ctx); + auto* in_var = ctx.InputVar("Input"); + framework::DDim in_dims; + if (in_var->IsType()) { + in_dims = in_var->Get().value().dims(); + } else { + in_dims = in_var->Get().dims(); + } + auto* out_t = ctx.Output("Out"); + out_t->Resize({in_dims.size()}); + auto out_data = out_t->mutable_data(platform::CPUPlace()); + for (int i = 0; i < in_dims.size(); ++i) { + out_data[i] = in_dims[i]; + } auto* out = ctx.Output("Out"); out->set_layout(framework::DataLayout::kMKLDNN); diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc index 2fdeecf89346fcf15f38b291ed5af49b8a2c8fc0..23428dd403e9b1ef62007c7b9193ed3b8482cab3 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc @@ -29,11 +29,11 @@ USE_OP_ITSELF(elementwise_add); USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN); USE_OP(elementwise_mul); USE_OP_DEVICE_KERNEL(elementwise_mul, MKLDNN); -USE_OP(relu); +USE_OP_ITSELF(relu); USE_OP_DEVICE_KERNEL(relu, MKLDNN); USE_OP_ITSELF(softmax); USE_OP_DEVICE_KERNEL(softmax, MKLDNN); -USE_OP(conv2d); +USE_OP_ITSELF(conv2d); USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, FP32); namespace paddle { @@ -55,7 +55,7 @@ class CacheTester { onednn_dev_ctx_->ResetBlobMap(nullptr); } - bool Analyze(unsigned short int num_entries) { + bool Analyze(uint16_t num_entries) { // Number of created objects in cache should be as expected (num_entries) return onednn_dev_ctx_->GetCachedObjectsNumber() == num_entries; } diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc index c776cf2a7c792c429fcf45a367d3f06bf9add5d2..e9dadd5ec937cd11c84777a582cc1f7ac9fc3c33 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc @@ -27,7 +27,7 @@ USE_OP_ITSELF(elementwise_add); USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN); -USE_OP(relu); +USE_OP_ITSELF(relu); USE_OP_DEVICE_KERNEL(relu, MKLDNN); USE_OP_ITSELF(softmax); USE_OP_DEVICE_KERNEL(softmax, MKLDNN); diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc index 52e2caaeb6ee129b6971d29dac41465b0373d5e3..9d0062e31388413fd4a441687631faebe8846c6e 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc @@ -24,14 +24,17 @@ #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" +#include "paddle/phi/core/kernel_registry.h" USE_OP(pool2d); USE_OP_DEVICE_KERNEL(pool2d, MKLDNN); -USE_OP(relu); +USE_OP_ITSELF(relu); USE_OP_DEVICE_KERNEL(relu, MKLDNN); -USE_OP(transpose); +USE_OP_ITSELF(transpose); USE_OP_DEVICE_KERNEL(transpose, MKLDNN); +PD_DECLARE_KERNEL(relu, CPU, ALL_LAYOUT); + namespace paddle { namespace operators { diff --git 
a/paddle/fluid/operators/mlu/activation_op_mlu_test.cc b/paddle/fluid/operators/mlu/activation_op_mlu_test.cc index f88286288317bd8e7c09cbd23ecccfce5df98e7d..6e3bd5e43c9c1d7e5c8a5dd4ba37afcfd7147e20 100644 --- a/paddle/fluid/operators/mlu/activation_op_mlu_test.cc +++ b/paddle/fluid/operators/mlu/activation_op_mlu_test.cc @@ -21,9 +21,8 @@ limitations under the License. */ namespace fw = paddle::framework; namespace plat = paddle::platform; -namespace math = paddle::operators::math; -USE_OP(relu); +USE_OP_ITSELF(relu); USE_OP_DEVICE_KERNEL(relu, MLU); // relu diff --git a/paddle/fluid/operators/mlu/mlu_baseop.cc b/paddle/fluid/operators/mlu/mlu_baseop.cc index 9de03582cbbf53e843e5f4531a6da6c1c2a87dd5..1fdaa153e3c27ed1a83696bf03d68dbfd2b93ae9 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.cc +++ b/paddle/fluid/operators/mlu/mlu_baseop.cc @@ -499,6 +499,27 @@ MLUCnnlTrigonDesc::~MLUCnnlTrigonDesc() { output_desc, output)); } +/* static */ void MLUCnnl::Concat(const MLUDeviceContext& dev_ctx, + const int pack_num, const int axis, + const cnnlTensorDescriptor_t inputs_desc[], + const void* const inputs[], + const cnnlTensorDescriptor_t output_desc, + void* output) { + cnnlHandle_t handle = dev_ctx.cnnl_handle(); + + size_t workspace_size = 0; + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlGetConcatWorkspaceSize(handle, pack_num, &workspace_size)); + + Tensor workspace(paddle::experimental::DataType::INT8); + workspace.Resize(framework::DDim({static_cast(workspace_size)})); + void* workspace_ptr = workspace.mutable_data(dev_ctx.GetPlace()); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlConcat(handle, pack_num, axis, inputs_desc, + inputs, workspace_ptr, workspace_size, + output_desc, output)); +} + /* static */ void MLUCnnl::Div( const ExecutionContext& ctx, cnnlComputationPreference_t prefer, const cnnlTensorDescriptor_t in0_desc, const void* in0, @@ -977,6 +998,27 @@ MLUCnnlTrigonDesc::~MLUCnnlTrigonDesc() { output_descs, output_ptrs)); } +/* static */ void MLUCnnl::Split(const MLUDeviceContext& dev_ctx, int split_num, + int axis, + const cnnlTensorDescriptor_t input_desc, + const void* input_ptr, + const cnnlTensorDescriptor_t output_descs[], + void* output_ptrs[]) { + cnnlHandle_t handle = dev_ctx.cnnl_handle(); + + size_t workspace_size; + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlGetSplitWorkspaceSize(handle, split_num, &workspace_size)); + + Tensor workspace(paddle::experimental::DataType::INT8); + workspace.Resize(framework::DDim({static_cast(workspace_size)})); + void* workspace_ptr = workspace.mutable_data(dev_ctx.GetPlace()); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSplit(handle, split_num, axis, input_desc, + input_ptr, workspace_ptr, workspace_size, + output_descs, output_ptrs)); +} + /* static */ void MLUCnnl::GatherFunctor( const ExecutionContext& ctx, const int axis, const int batch_dims, const cnnlTensorDescriptor_t params_desc, const void* params, diff --git a/paddle/fluid/operators/mlu/mlu_baseop.h b/paddle/fluid/operators/mlu/mlu_baseop.h index 2cbecba9fa081970221242555b6b805ff9acae83..b55b10686e92e2b1b5b3a7390289f8329ac04a04 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.h +++ b/paddle/fluid/operators/mlu/mlu_baseop.h @@ -403,6 +403,11 @@ class MLUCnnl { const void* const inputs[], const cnnlTensorDescriptor_t output_desc, void* output); + static void Concat(const MLUDeviceContext& dev_ctx, const int pack_num, + const int axis, const cnnlTensorDescriptor_t inputs_desc[], + const void* const inputs[], + const cnnlTensorDescriptor_t output_desc, void* output); + static void Cast(const ExecutionContext& 
ctx, cnnlCastDataType_t cast_type, const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t output_desc, void* output); @@ -566,6 +571,12 @@ class MLUCnnl { const cnnlTensorDescriptor_t output_descs[], void* output_ptrs[]); + static void Split(const MLUDeviceContext& dev_ctx, int split_num, int axis, + const cnnlTensorDescriptor_t input_desc, + const void* input_ptr, + const cnnlTensorDescriptor_t output_descs[], + void* output_ptrs[]); + static void Scale(const ExecutionContext& ctx, const int axis, const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t alpha_desc, const void* alpha, @@ -1157,19 +1168,22 @@ inline void TransposeFromMLUTensor(const ExecutionContext& ctx, const Tensor* transformed_input, Tensor* transformed_output, bool need_reshape_or_alloc) { - auto in_dims_vec = phi::vectorize(transformed_input->dims()); + const int dim_size = perm.size(); if (need_reshape_or_alloc) { + std::vector output_shape; + auto input_dims = transformed_input->dims(); + for (int i = 0; i < dim_size; ++i) { + output_shape.push_back(input_dims[perm[i]]); + } transformed_output->mutable_data( - {in_dims_vec[perm[0]], in_dims_vec[perm[1]], in_dims_vec[perm[2]], - in_dims_vec[perm[3]]}, - ctx.GetPlace()); + framework::DDim(output_shape.data(), dim_size), ctx.GetPlace()); } MLUCnnlTensorDesc trans_in_desc(*transformed_input, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); MLUCnnlTensorDesc trans_out_desc(*transformed_output, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); - MLUCnnl::Transpose(ctx, perm, in_dims_vec.size(), trans_in_desc.get(), + MLUCnnl::Transpose(ctx, perm, dim_size, trans_in_desc.get(), GetBasePtr(transformed_input), trans_out_desc.get(), GetBasePtr(transformed_output)); } diff --git a/paddle/fluid/operators/mode_op.cu b/paddle/fluid/operators/mode_op.cu index afb949d3374c62f561e910ea77e516bdb4004ac0..2bacda8afb0eb340c4c8d4068f3013e2adbc7f91 100644 --- a/paddle/fluid/operators/mode_op.cu +++ b/paddle/fluid/operators/mode_op.cu @@ -24,7 +24,6 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/mode_op.h" #include "paddle/fluid/operators/top_k_function_cuda.h" -#include "paddle/fluid/operators/top_k_v2_op.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/multi_dot_op.cc b/paddle/fluid/operators/multi_dot_op.cc index fe4609b3ad91e703fc28a997d5505d4cffa001a8..b309e1b87ef9033bd4302cdad4ea60a64cbf02eb 100644 --- a/paddle/fluid/operators/multi_dot_op.cc +++ b/paddle/fluid/operators/multi_dot_op.cc @@ -87,135 +87,6 @@ inline framework::DDim ComputeAndCheckShape( return out_dim; } -template -inline framework::Tensor MatMul(const framework::ExecutionContext& ctx, - const framework::Tensor& matrix_a, - const framework::Tensor& matrix_b, - const framework::DDim& a_dim, - const framework::DDim& b_dim) { - auto place = ctx.GetPlace(); - auto blas = phi::funcs::GetBlas(ctx); - - framework::Tensor matrix_c; - framework::DDim c_dim = phi::make_ddim({a_dim[0], b_dim[1]}); - matrix_c.Resize(c_dim); - matrix_c.mutable_data(place); - - auto mat_dim_a = phi::funcs::CreateMatrixDescriptor(a_dim, 0, false); - auto mat_dim_b = phi::funcs::CreateMatrixDescriptor(b_dim, 0, false); - const T alpha = static_cast(1.0); - blas.MatMul(matrix_a, mat_dim_a, matrix_b, mat_dim_b, alpha, &matrix_c, T(0)); - return matrix_c; -} - -/** - * @brief Recursively calculate matrix multiplication according to the optimal - * order - * Let k = order[i,j], then ins[i...j] = ins[i...k] * ins[k+1 ...j] - * - * @param - * ins: 
the input tensors - * ins_dims: the shape of ins after reshape - * order: the optimal order - * i: the left of sub chain - * j: the righe of sub chain - * save_result: set true by backward - * results: save the intermediate result during backward - */ -template -inline framework::Tensor MatChainMul( - const framework::ExecutionContext& ctx, - const std::vector& ins, - const std::vector& ins_dims, - const std::vector& order, const uint64_t i, const uint64_t j, - const bool save_result, std::vector* results) { - if (i == j) { - return *ins[i]; - } - - const auto A = MatChainMul(ctx, ins, ins_dims, order, i, - order[i * ins.size() + j], - save_result, results); - framework::DDim a_dim = A.dims(); - if (i == order[i * ins.size() + j]) { - a_dim = ins_dims[i]; - } - - const auto B = MatChainMul(ctx, ins, ins_dims, order, - order[i * ins.size() + j] + 1, j, - save_result, results); - framework::DDim b_dim = B.dims(); - if (j == order[i * ins.size() + j] + 1) { - b_dim = ins_dims[j]; - } - - auto result = MatMul(ctx, A, B, a_dim, b_dim); - if (save_result) { - (*results)[i * ins.size() + j] = result; - } - return result; -} - -/** - * @brief get the optimal order - */ -std::vector GetOrder(const std::vector& ins, - const std::vector& ins_dims) { - auto n = ins.size(); - // p: save the ins shape, the ins[i] shape is (p[i], p[i+1]) - std::vector p(n + 1); - for (uint64_t i = 0; i < n; i++) { - p[i] = ins_dims[i][0]; - } - p[n] = ins_dims[n - 1][1]; - - // m[i, j]: save the lowest cost for multiplying ins[i...j] - std::vector m(n * n, 0); - // define ins[i...j] means multiplying matrices from ins[i] to ins[j] - // order[i, j] = k, this means that ins[i...k] and ins[k...j] fist and then - // multiply the resulting matrices is the optimal order for ins[i...j] - std::vector order(n * n); - for (uint64_t l = 1; l < n; l++) { - for (uint64_t i = 0; i < n - l; i++) { - auto j = i + l; - m[i * n + j] = 0xffffffff; - for (uint64_t k = i; k < j; k++) { - uint64_t q = - m[i * n + k] + m[(k + 1) * n + j] + p[i] * p[k + 1] * p[j + 1]; - if (q < m[i * n + j]) { - m[i * n + j] = q; - order[i * n + j] = k; - } - } - } - } - return order; -} - -template -static inline framework::Tensor MultiDotMatChainOrder( - const framework::ExecutionContext& ctx, - const std::vector& ins, - const std::vector& ins_dims, const bool save_result, - std::vector* results) { - auto order = GetOrder(ins, ins_dims); - return MatChainMul(ctx, ins, ins_dims, order, 0, - ins.size() - 1, save_result, results); -} - -inline void GetDims(const std::vector& ins, - std::vector* ins_dims) { - const auto n = ins.size(); - for (size_t i = 0; i < n; i++) { - (*ins_dims)[i] = ins[i]->dims(); - if (i == 0 && (*ins_dims)[i].size() == 1) { - (*ins_dims)[i] = phi::make_ddim({1, (*ins_dims)[i][0]}); - } else if (i == n - 1 && (*ins_dims)[i].size() == 1) { - (*ins_dims)[i] = phi::make_ddim({(*ins_dims)[i][0], 1}); - } - } -} - class MultiDotOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -252,78 +123,6 @@ class MultiDotOp : public framework::OperatorWithKernel { } }; -/** - * 1. there are only 2 matrices: direct matrix multiplication A*B - * 2. there are only 3 matrices: calculate the cost of (A*B)*C and A*(B*C), - * choose the least cost order for calculation - * 3. 
more than 3 matrices: call MultiDotMatChainOrder - */ -template -class MultiDotKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto ins = ctx.MultiInput("X"); - auto* out = ctx.Output("Out"); - - auto place = ctx.GetPlace(); - out->mutable_data(place); - - auto blas = phi::funcs::GetBlas(ctx); - - auto n = ins.size(); - std::vector ins_dims(n); - GetDims(ins, &ins_dims); - - const T scale = static_cast(1.0); - if (n == 2) { - auto mat_dim_a = - phi::funcs::CreateMatrixDescriptor(ins_dims[0], 0, false); - auto mat_dim_b = - phi::funcs::CreateMatrixDescriptor(ins_dims[1], 0, false); - blas.MatMul(*ins[0], mat_dim_a, *ins[1], mat_dim_b, scale, out, T(0)); - } else if (n == 3) { - const auto Ma = ins_dims[0][0]; - const auto Ka = ins_dims[0][1]; - const auto Nb = ins_dims[1][1]; - const auto Nc = ins_dims[2][1]; - const uint64_t cost1 = Ma * Nb * (Ka + Nc); - const uint64_t cost2 = Ka * Nc * (Nb + Ma); - auto mat_dim_a = - phi::funcs::CreateMatrixDescriptor(ins_dims[0], 0, false); - auto mat_dim_b = - phi::funcs::CreateMatrixDescriptor(ins_dims[1], 0, false); - auto mat_dim_c = - phi::funcs::CreateMatrixDescriptor(ins_dims[2], 0, false); - if (cost1 < cost2) { - framework::Tensor tmp_out; - tmp_out.mutable_data(place, Ma * Nb * sizeof(T)); - framework::DDim tmp_dim = phi::make_ddim({Ma, Nb}); - blas.MatMul(*ins[0], mat_dim_a, *ins[1], mat_dim_b, scale, &tmp_out, - T(0)); - auto mat_dim_tmp = - phi::funcs::CreateMatrixDescriptor(tmp_dim, 0, false); - blas.MatMul(tmp_out, mat_dim_tmp, *ins[2], mat_dim_c, scale, out, T(0)); - } else { - framework::Tensor tmp_out; - tmp_out.mutable_data(place, Ka * Nc * sizeof(T)); - framework::DDim tmp_dim = phi::make_ddim({Ka, Nc}); - blas.MatMul(*ins[1], mat_dim_b, *ins[2], mat_dim_c, scale, &tmp_out, - T(0)); - auto mat_dim_tmp = - phi::funcs::CreateMatrixDescriptor(tmp_dim, 0, false); - blas.MatMul(*ins[0], mat_dim_a, tmp_out, mat_dim_tmp, scale, out, T(0)); - } - } else { - std::vector results; - const auto tmp = MultiDotMatChainOrder( - ctx, ins, ins_dims, false, &results); - auto out_dim = out->dims(); - *out = tmp; - out->Resize(out_dim); - } - } -}; - class MultiDotOpGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -341,180 +140,6 @@ class MultiDotOpGrad : public framework::OperatorWithKernel { } }; -template -class MultiDotGradKernel : public framework::OpKernel { - public: - /** - * @brief calculate dA and dB - * dA = dout * transpose(B) - * dB = transpose(A) * dout - */ - void CalcGrad(const framework::ExecutionContext& ctx, - const framework::Tensor& dout, const framework::Tensor& A, - const framework::Tensor& B, const framework::DDim& dout_dim, - const framework::DDim& a_dim, const framework::DDim& b_dim, - framework::Tensor* dA, framework::Tensor* dB) const { - auto mat_dim_dout = phi::funcs::CreateMatrixDescriptor(dout_dim, 0, false); - auto mat_dim_a = phi::funcs::CreateMatrixDescriptor(a_dim, 0, true); - auto mat_dim_b = phi::funcs::CreateMatrixDescriptor(b_dim, 0, true); - T alpha = static_cast(1.0); - auto blas = phi::funcs::GetBlas(ctx); - blas.MatMul(A, mat_dim_a, dout, mat_dim_dout, alpha, dB, T(0)); - blas.MatMul(dout, mat_dim_dout, B, mat_dim_b, alpha, dA, T(0)); - } - - /** - * @brief calculate multi matrix multiplication grad by a chain order - * @param - * dout: the grad of multi matrix multiplication out - * dx: the out grad of inputs - * ins: the input tensors - * ins_dims: the shape of ins after 
reshape - * order: the optimal order - * i: the left of sub chain - * j: the righe of sub chain - * results: the intermediate result of farward - */ - void MatChainMulGrad(const framework::ExecutionContext& ctx, - const framework::Tensor& dout, - std::vector* dx, - const std::vector& ins, - const framework::DDim& dout_dim, - const std::vector& ins_dims, - const std::vector& order, const uint64_t i, - const uint64_t j, - const std::vector& results) const { - if (i == j) { - *((*dx)[i]) = dout; - return; - } - - const auto n = ins.size(); - const auto right = order[i * n + j]; - const auto left = order[i * n + j] + 1; - // get the multi result of left sub chain - const auto* A = &results[i * n + right]; - framework::DDim a_dim = A->dims(); - if (i == right) { - A = ins[i]; - a_dim = ins_dims[i]; - } - // get the multi result of right sub chain - const auto* B = &results[left * n + j]; - framework::DDim b_dim = B->dims(); - if (left == j) { - B = ins[j]; - b_dim = ins_dims[j]; - } - framework::Tensor dA, dB; - dA.Resize({dout_dim[0], b_dim[0]}); - dB.Resize({a_dim[1], dout_dim[1]}); - dA.mutable_data(ctx.GetPlace()); - dB.mutable_data(ctx.GetPlace()); - - CalcGrad(ctx, dout, *A, *B, dout_dim, a_dim, b_dim, &dA, &dB); - MatChainMulGrad(ctx, dA, dx, ins, dA.dims(), ins_dims, order, i, right, - results); - MatChainMulGrad(ctx, dB, dx, ins, dB.dims(), ins_dims, order, left, j, - results); - } - - void MultiDotGradMatChainOrder( - const framework::ExecutionContext& ctx, const framework::Tensor& dout, - const std::vector& ins, - const framework::DDim& dout_dim, - const std::vector& ins_dims, - std::vector* dx) const { - auto order = GetOrder(ins, ins_dims); - auto n = ins.size(); - std::vector results(n * n); - MatChainMul(ctx, ins, ins_dims, order, 0, n - 1, true, - &results); - MatChainMulGrad(ctx, dout, dx, ins, dout_dim, ins_dims, order, 0, n - 1, - results); - } - - void Compute(const framework::ExecutionContext& ctx) const { - auto ins = ctx.MultiInput("X"); - auto dout = *ctx.Input(framework::GradVarName("Out")); - auto dx = ctx.MultiOutput(framework::GradVarName("X")); - - auto blas = phi::funcs::GetBlas(ctx); - auto place = ctx.GetPlace(); - - const auto n = ins.size(); - for (size_t i = 0; i < n; i++) { - dx[i]->mutable_data(place); - } - - std::vector ins_dims(n); - GetDims(ins, &ins_dims); - - framework::DDim dout_dim = dout.dims(); - if (ins[0]->dims().size() == 1 && ins[n - 1]->dims().size() == 1) { - dout_dim = phi::make_ddim({1, 1}); - } else if (ins[0]->dims().size() == 1) { - if (dout_dim.size() == 1) { - dout_dim = phi::make_ddim({1, dout_dim[0]}); - } - } else if (ins[n - 1]->dims().size() == 1) { - if (dout_dim.size() == 1) { - dout_dim = phi::make_ddim({dout_dim[0], 1}); - } - } - - T alpha = static_cast(1); - auto mat_dim_dout = phi::funcs::CreateMatrixDescriptor(dout_dim, 0, false); - if (n == 2) { - CalcGrad(ctx, dout, *ins[0], *ins[1], dout_dim, ins_dims[0], ins_dims[1], - dx[0], dx[1]); - } else if (n == 3) { - const auto Ma = ins_dims[0][0]; - const auto Ka = ins_dims[0][1]; - const auto Nb = ins_dims[1][1]; - const auto Nc = ins_dims[2][1]; - const uint64_t cost1 = Ma * Nb * (Ka + Nc); - const uint64_t cost2 = Ka * Nc * (Nb + Ma); - auto mat_dim_a = - phi::funcs::CreateMatrixDescriptor(ins_dims[0], 0, false); - auto mat_dim_b = - phi::funcs::CreateMatrixDescriptor(ins_dims[1], 0, false); - auto mat_dim_c = - phi::funcs::CreateMatrixDescriptor(ins_dims[2], 0, false); - if (cost1 < cost2) { - framework::Tensor tmp_out, tmp_dout; - tmp_out.Resize({Ma, Nb}); - 
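// Editorial sketch, not part of the original patch: this n == 3 branch (and
// the matching one in the removed forward kernel) picks the cheaper
// association for A(Ma x Ka) * B(Ka x Nb) * C(Nb x Nc):
//   (A*B)*C costs Ma*Ka*Nb + Ma*Nb*Nc = Ma*Nb*(Ka + Nc)  -> cost1
//   A*(B*C) costs Ka*Nb*Nc + Ma*Ka*Nc = Ka*Nc*(Nb + Ma)  -> cost2
// For example, Ma=10, Ka=100, Nb=5, Nc=50 gives cost1 = 7,500 and
// cost2 = 75,000, so (A*B)*C is chosen. For n > 3 the removed GetOrder runs
// the classic matrix-chain dynamic program
//   m[i][j] = min over k of (m[i][k] + m[k+1][j] + p[i]*p[k+1]*p[j+1])
// and MatChainMul / MatChainMulGrad replay the resulting order.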
tmp_out.mutable_data(place); - tmp_dout.Resize({mat_dim_dout.height_, Nb}); - tmp_dout.mutable_data(place); - blas.MatMul(*ins[0], mat_dim_a, *ins[1], mat_dim_b, alpha, &tmp_out, - T(0)); - CalcGrad(ctx, dout, tmp_out, *ins[2], dout_dim, tmp_out.dims(), - ins_dims[2], &tmp_dout, dx[2]); - CalcGrad(ctx, tmp_dout, *ins[0], *ins[1], tmp_dout.dims(), ins_dims[0], - ins_dims[1], dx[0], dx[1]); - } else { - framework::Tensor tmp_out, tmp_dout; - tmp_out.Resize({Ka, Nc}); - tmp_out.mutable_data(place); - tmp_dout.Resize({Ka, mat_dim_dout.width_}); - tmp_dout.mutable_data(place); - blas.MatMul(*ins[1], mat_dim_b, *ins[2], mat_dim_c, alpha, &tmp_out, - T(0)); - CalcGrad(ctx, dout, *ins[0], tmp_out, dout_dim, ins_dims[0], - tmp_dout.dims(), dx[0], &tmp_dout); - CalcGrad(ctx, tmp_dout, *ins[1], *ins[2], tmp_dout.dims(), ins_dims[1], - ins_dims[2], dx[1], dx[2]); - } - } else { - MultiDotGradMatChainOrder(ctx, dout, ins, dout_dim, ins_dims, &dx); - if (ins[n - 1]->dims().size() == 1) { - dx[n - 1]->Resize({dx[n - 1]->dims()[0]}); - } - } - } -}; - template class MultiDotOpGradMaker : public framework::SingleGradOpMaker { public: @@ -552,25 +177,3 @@ REGISTER_OPERATOR(multi_dot, ops::MultiDotOp, ops::MultiDotOpMaker, REGISTER_OPERATOR(multi_dot_grad, ops::MultiDotOpGrad, ops::MultiDotOpDoubleGradMaker, ops::MultiDotOpDoubleGradMaker); - -REGISTER_OP_CPU_KERNEL( - multi_dot, ops::MultiDotKernel, - ops::MultiDotKernel); -REGISTER_OP_CPU_KERNEL( - multi_dot_grad, - ops::MultiDotGradKernel, - ops::MultiDotGradKernel); - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -REGISTER_OP_CUDA_KERNEL( - multi_dot, ops::MultiDotKernel, - ops::MultiDotKernel, - ops::MultiDotKernel); -REGISTER_OP_CUDA_KERNEL( - multi_dot_grad, - ops::MultiDotGradKernel, - ops::MultiDotGradKernel, - ops::MultiDotGradKernel); -#endif diff --git a/paddle/fluid/operators/multinomial_op.cc b/paddle/fluid/operators/multinomial_op.cc index 1143f9cb37aa54bea430d3a8bca8b62b02da4e2b..0113f638b9a47d161c890a0f547f8680af4018e7 100644 --- a/paddle/fluid/operators/multinomial_op.cc +++ b/paddle/fluid/operators/multinomial_op.cc @@ -53,8 +53,8 @@ class MultinomialOp : public framework::OperatorWithKernel { namespace ops = paddle::operators; namespace plat = paddle::platform; -DELCARE_INFER_SHAPE_FUNCTOR(multinomial, MultinomialInferShapeFunctor, - PT_INFER_META(phi::MultinomialInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(multinomial, MultinomialInferShapeFunctor, + PD_INFER_META(phi::MultinomialInferMeta)); REGISTER_OPERATOR( multinomial, ops::MultinomialOp, ops::MultinomialOpMaker, paddle::framework::EmptyGradOpMaker, diff --git a/paddle/fluid/operators/mv_op.cc b/paddle/fluid/operators/mv_op.cc index ab9f10070fc60deab8974ae0e81e2b4c6cef2ffd..bf7222fc45c66085473eae627abe97b8a41d4268 100644 --- a/paddle/fluid/operators/mv_op.cc +++ b/paddle/fluid/operators/mv_op.cc @@ -16,8 +16,11 @@ limitations under the License. 
*/ #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -42,33 +45,6 @@ class MVOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; protected: - void InferShape(framework::InferShapeContext *context) const override { - OP_INOUT_CHECK(context->HasInput("X"), "Input", "X", "mv"); - OP_INOUT_CHECK(context->HasInput("Vec"), "Input", "Vec", "mv"); - OP_INOUT_CHECK(context->HasOutput("Out"), "Output", "Out", "mv"); - - auto dim_x = context->GetInputDim("X"); - auto dim_vec = context->GetInputDim("Vec"); - PADDLE_ENFORCE_EQ( - dim_x.size(), 2, - platform::errors::InvalidArgument( - "The rank of input X should be 2, but is %d", dim_x.size())); - PADDLE_ENFORCE_EQ( - dim_vec.size(), 1, - platform::errors::InvalidArgument( - "The rank of input Vec should be 1, but is %d", dim_vec.size())); - PADDLE_ENFORCE_EQ(dim_x[1], dim_vec[0], - platform::errors::InvalidArgument( - "X's second dimension is expected to be equal to " - "Vec's first dimension" - "but recieved X'shape = [%s], Vec's shape = [%s]", - dim_x, dim_vec)); - - framework::DDim dim_out = phi::make_ddim({dim_x[0]}); - - context->SetOutputDim("Out", dim_out); - context->ShareLoD("X", /*->*/ "Out"); - } }; template @@ -118,7 +94,11 @@ class MVOpGrad : public framework::OperatorWithKernel { namespace ops = paddle::operators; namespace plat = paddle::platform; +DECLARE_INFER_SHAPE_FUNCTOR(mv, MvInferShapeFunctor, + PD_INFER_META(phi::MvInferMeta)); + REGISTER_OPERATOR(mv, ops::MVOp, ops::MVOpMaker, ops::MVOpGradMaker, - ops::MVOpGradMaker); + ops::MVOpGradMaker, + MvInferShapeFunctor); REGISTER_OPERATOR(mv_grad, ops::MVOpGrad); diff --git a/paddle/fluid/operators/nll_loss_op.cc b/paddle/fluid/operators/nll_loss_op.cc index f510c7bebec876d034c1af923a4f7077c096000c..a4e1f7b3091a9f692e479300310333bfdd359096 100644 --- a/paddle/fluid/operators/nll_loss_op.cc +++ b/paddle/fluid/operators/nll_loss_op.cc @@ -12,9 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
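Before the shape checks and CPU kernels removed below, a compact editorial sketch of what nll_loss computes in the plain 2-D case; the name is assumed for illustration and this is not the phi kernel that now backs the op. Each sample contributes -weight[label] * x[sample, label], samples whose label equals ignore_index are skipped, and the "mean" reduction divides by the sum of the selected weights.

#include <cstdint>

// Sketch of nll_loss_1D with reduction == "mean"; see the removed
// nll_loss_op.h below for the full reduction and 4-D variants.
double NllLoss1DMean(const float* x, const int64_t* label, const float* weight,
                     int64_t batch_size, int64_t n_classes,
                     int64_t ignore_index) {
  double loss = 0.0, total_weight = 0.0;
  for (int64_t i = 0; i < batch_size; ++i) {
    const int64_t cur = label[i];
    if (cur == ignore_index) continue;            // skipped samples add nothing
    const double w = weight ? weight[cur] : 1.0;  // optional per-class weight
    total_weight += w;
    loss -= w * x[i * n_classes + cur];
  }
  return total_weight != 0.0 ? loss / total_weight : loss;
}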
*/ -#include "paddle/fluid/operators/nll_loss_op.h" #include #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/ternary.h" namespace paddle { namespace operators { @@ -23,77 +25,6 @@ class NLLLossOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "NLLLoss"); - OP_INOUT_CHECK(ctx->HasInput("Label"), "Input", "Label", "NLLLoss"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "NLLLoss"); - OP_INOUT_CHECK(ctx->HasOutput("Total_weight"), "Output", "Total_weight", - "NLLLoss"); - - auto x_dims = ctx->GetInputDim("X"); - auto label_dims = ctx->GetInputDim("Label"); - auto reduction = ctx->Attrs().Get("reduction"); - - PADDLE_ENFORCE_EQ(x_dims.size() == 2 || x_dims.size() == 4, true, - platform::errors::InvalidArgument( - "The tensor rank of Input(X) must be 2 or 4.")); - bool contain_unknown_dim = phi::contain_unknown_dim(x_dims) || - phi::contain_unknown_dim(label_dims); - bool check = ctx->IsRuntime() || !contain_unknown_dim; - if (check) { - PADDLE_ENFORCE_EQ( - x_dims[0], label_dims[0], - platform::errors::InvalidArgument( - "ShapeError: Expected input batch_size to match label batch_size," - "But received: the Input(x) batch_size is [%s], the Input(label) " - " batch_size is [%s].", - x_dims[0], label_dims[0])); - if (ctx->HasInput("Weight")) { - auto w_dims = ctx->GetInputDim("Weight"); - PADDLE_ENFORCE_EQ(w_dims.size(), 1, - platform::errors::InvalidArgument( - "Input(Weight) should be a 1D tensor.")); - PADDLE_ENFORCE_EQ( - x_dims[1], w_dims[0], - platform::errors::InvalidArgument( - "Expected input tensor Weight's size should equal " - "to the first dimension of the input tensor X. 
But received " - "Weight's " - "size is %d, the first dimension of input X is %d", - w_dims[0], x_dims[1])); - } - } - if (x_dims.size() == 2) { - if (reduction == "none") { - ctx->SetOutputDim("Out", {x_dims[0]}); - } else { - ctx->SetOutputDim("Out", {1}); - } - } else if (x_dims.size() == 4) { - PADDLE_ENFORCE_EQ(label_dims.size(), 3, - platform::errors::InvalidArgument( - "Expected Input(Lable) dimensions=3, received %d.", - label_dims.size())); - auto input0 = x_dims[0]; - auto input2 = x_dims[2]; - auto input3 = x_dims[3]; - auto label0 = label_dims[0]; - auto label1 = label_dims[1]; - auto label2 = label_dims[2]; - PADDLE_ENFORCE_EQ( - input0 == label0 && input2 == label1 && input3 == label2, true, - platform::errors::InvalidArgument("Input(X) tensor shape should " - "match to Input(Label) tensor " - "shape.")); - if (reduction == "none") { - ctx->SetOutputDim("Out", {x_dims[0], x_dims[2], x_dims[3]}); - } else { - ctx->SetOutputDim("Out", {1}); - } - } - ctx->SetOutputDim("Total_weight", {1}); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -259,15 +190,11 @@ class NLLLossGradMaker : public framework::SingleGradOpMaker { } // namespace operators } // namespace paddle +DECLARE_INFER_SHAPE_FUNCTOR(nll_loss, NllLossRawInferShapeFunctor, + PD_INFER_META(phi::NllLossRawInferMeta)); namespace ops = paddle::operators; REGISTER_OPERATOR(nll_loss, ops::NLLLossOp, ops::NLLLossOpMaker, ops::NLLLossGradMaker, - ops::NLLLossGradMaker); + ops::NLLLossGradMaker, + NllLossRawInferShapeFunctor); REGISTER_OPERATOR(nll_loss_grad, ops::NLLLossGradOp); -REGISTER_OP_CPU_KERNEL( - nll_loss, ops::NLLLossOpKernel, - ops::NLLLossOpKernel); -REGISTER_OP_CPU_KERNEL( - nll_loss_grad, - ops::NLLLossGradOpKernel, - ops::NLLLossGradOpKernel); diff --git a/paddle/fluid/operators/nll_loss_op.h b/paddle/fluid/operators/nll_loss_op.h deleted file mode 100644 index be6f4422d4ac6a475477c025c4b76eabdbf4f9e0..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/nll_loss_op.h +++ /dev/null @@ -1,306 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -static void nll_loss_1D(T* out_data, T* total_weight_data, const T* x_data, - const int64_t* label_data, const T* weight_data, - const int64_t batch_size, const int64_t n_classes, - const std::string reduction, - const int64_t ignore_index) { - if (reduction == "none") { - for (int64_t i = 0; i < batch_size; ++i) { - const auto cur_label = label_data[i]; - if (cur_label == ignore_index) { - out_data[i] = 0; - continue; - } - PADDLE_ENFORCE_EQ(cur_label >= 0 && cur_label < n_classes, true, - platform::errors::InvalidArgument( - "Label value is out of range. 
" - "Expected label value in range of [0, %d), but " - "received value is %d.", - n_classes, cur_label)); - - const auto cur_weight = - weight_data ? weight_data[cur_label] : static_cast(1); - out_data[i] = -x_data[i * n_classes + cur_label] * cur_weight; - } - return; - } - - T output_val = 0; - T total_weight_val = 0; - - for (int64_t i = 0; i < batch_size; i++) { - const auto cur_label = label_data[i]; - if (cur_label == ignore_index) { - out_data[i] = 0; - continue; - } - PADDLE_ENFORCE_EQ(cur_label >= 0 && cur_label < n_classes, true, - platform::errors::InvalidArgument( - "label should not be out of bounds.")); - - const auto cur_weight = - weight_data ? weight_data[cur_label] : static_cast(1); - total_weight_val += cur_weight; - output_val -= x_data[i * n_classes + cur_label] * cur_weight; - } - if (reduction == "mean" && total_weight_val != 0) { - output_val /= total_weight_val; - } - *out_data = output_val; - *total_weight_data = total_weight_val; -} - -template -static void nll_loss_2D(T* out_data, T* total_weight_data, const T* x_data, - const int64_t* label_data, const T* weight_data, - const int64_t batch_size, const int64_t n_classes, - const int64_t in_dim2, const int64_t in_dim3, - const std::string reduction, - const int64_t ignore_index) { - const auto map_size = in_dim2 * in_dim3; - const auto sample_size = n_classes * map_size; - if (reduction == "none") { - for (int i = 0; i < batch_size; i++) { - for (int h = 0; h < in_dim2; h++) { - for (int w = 0; w < in_dim3; w++) { - const auto index = i * map_size + h * in_dim3 + w; - const auto cur_label = label_data[index]; - if (cur_label == ignore_index) { - out_data[index] = 0; - continue; - } - PADDLE_ENFORCE_EQ(cur_label >= 0 && cur_label < n_classes, true, - platform::errors::InvalidArgument( - "label should not be out of bounds.")); - const auto cur_weight = - weight_data ? weight_data[cur_label] : static_cast(1); - out_data[index] = -x_data[i * sample_size + cur_label * map_size + - h * in_dim3 + w] * - cur_weight; - } - } - } - return; - } - - T output_val = 0; - T total_weight_val = 0; - - for (int i = 0; i < batch_size; i++) { - for (int h = 0; h < in_dim2; h++) { - for (int w = 0; w < in_dim3; w++) { - const auto index = i * map_size + h * in_dim3 + w; - const auto cur_label = label_data[index]; - if (cur_label == ignore_index) { - out_data[index] = 0; - continue; - } - PADDLE_ENFORCE_EQ(cur_label >= 0 && cur_label < n_classes, true, - platform::errors::InvalidArgument( - "label should not be out of bounds.")); - const auto cur_weight = - weight_data ? weight_data[cur_label] : static_cast(1); - total_weight_val += cur_weight; - output_val -= - x_data[i * sample_size + cur_label * map_size + h * in_dim3 + w] * - cur_weight; - } - } - } - - if (reduction == "mean" && total_weight_val != 0) { - output_val /= total_weight_val; - } - *out_data = output_val; - *total_weight_data = total_weight_val; -} - -template -class NLLLossOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* labels = ctx.Input("Label"); - auto* weight = ctx.Input("Weight"); - auto* out = ctx.Output("Out"); - auto* total_weight = ctx.Output("Total_weight"); - auto reduction = ctx.Attr("reduction"); - auto ignore_index = ctx.Attr("ignore_index"); - - auto x_data = x->data(); - auto label_data = labels->data(); - auto weight_data = weight ? 
weight->data() : nullptr; - auto out_data = out->mutable_data(ctx.GetPlace()); - auto total_weight_data = total_weight->mutable_data(ctx.GetPlace()); - *total_weight_data = 0; - - auto x_dims = x->dims(); - const auto batch_size = x_dims[0]; - const auto n_classes = x_dims[1]; - - if (x_dims.size() == 2) { - nll_loss_1D(out_data, total_weight_data, x_data, label_data, - weight_data, batch_size, n_classes, reduction, - ignore_index); - } else if (x_dims.size() == 4) { - const auto in_dim2 = x_dims[2]; - const auto in_dim3 = x_dims[3]; - nll_loss_2D(out_data, total_weight_data, x_data, label_data, - weight_data, batch_size, n_classes, in_dim2, in_dim3, - reduction, ignore_index); - } - } -}; - -template -static void nll_loss_grad_1D(T* dx_data, const T* dout_data, - const int64_t* label_data, const T* weight_data, - const T* total_weight_data, - const int64_t batch_size, const int64_t n_classes, - const std::string reduction, - const int64_t ignore_index) { - if (reduction == "none") { - for (int i = 0; i < batch_size; i++) { - const auto cur_label = label_data[i]; - if (cur_label == ignore_index) { - continue; - } - const auto cur_weight = - weight_data ? weight_data[cur_label] : static_cast(1); - dx_data[i * n_classes + cur_label] = -dout_data[i] * cur_weight; - } - return; - } - - const T dout_val = *dout_data; - const T total_weight_val = *total_weight_data; - for (int i = 0; i < batch_size; i++) { - const auto cur_label = label_data[i]; - if (cur_label == ignore_index) { - continue; - } - const auto cur_weight = - weight_data ? weight_data[cur_label] : static_cast(1); - dx_data[i * n_classes + cur_label] = -dout_val * cur_weight; - if (reduction == "mean") { - dx_data[i * n_classes + cur_label] /= total_weight_val; - } - } -} - -template -static void nll_loss_grad_2D(T* dx_data, const T* dout_data, - const int64_t* label_data, const T* weight_data, - const T* total_weight_data, - const int64_t batch_size, const int64_t n_classes, - const int64_t in_dim2, const int64_t in_dim3, - const std::string reduction, - const int64_t ignore_index) { - const auto map_size = in_dim2 * in_dim3; - const auto sample_size = n_classes * map_size; - - if (reduction == "none") { - for (int i = 0; i < batch_size; i++) { - for (int h = 0; h < in_dim2; h++) { - for (int w = 0; w < in_dim3; w++) { - const auto index = i * map_size + h * in_dim3 + w; - const auto cur_label = label_data[index]; - if (cur_label == ignore_index) { - continue; - } - const auto cur_weight = - weight_data ? weight_data[cur_label] : static_cast(1); - dx_data[i * sample_size + cur_label * map_size + h * in_dim3 + w] = - -cur_weight * dout_data[index]; - } - } - } - return; - } - - const T dout_val = *dout_data; - const T total_weight_val = *total_weight_data; - for (int i = 0; i < batch_size; i++) { - for (int h = 0; h < in_dim2; h++) { - for (int w = 0; w < in_dim3; w++) { - const auto index = i * map_size + h * in_dim3 + w; - const auto cur_label = label_data[index]; - if (cur_label == ignore_index) { - continue; - } - const auto cur_weight = - weight_data ? 
weight_data[cur_label] : static_cast(1); - const auto dx_index = - i * sample_size + cur_label * map_size + h * in_dim3 + w; - dx_data[dx_index] = -dout_val * cur_weight; - if (reduction == "mean") { - dx_data[dx_index] /= total_weight_val; - } - } - } - } -} - -template -class NLLLossGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* labels = ctx.Input("Label"); - auto* weight = ctx.Input("Weight"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* total_weight = ctx.Input("Total_weight"); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto ignore_index = ctx.Attr("ignore_index"); - auto reduction = ctx.Attr("reduction"); - - auto dx_data = dx->mutable_data(ctx.GetPlace()); - auto dout_data = dout->data(); - auto label_data = labels->data(); - auto weight_data = weight ? weight->data() : nullptr; - auto total_weight_data = total_weight->data(); - memset(dx_data, 0, dx->numel() * sizeof(T)); - - const auto x_dims = x->dims(); - const auto batch_size = x_dims[0]; - const auto n_classes = x_dims[1]; - - if (x_dims.size() == 2) { - nll_loss_grad_1D(dx_data, dout_data, label_data, weight_data, - total_weight_data, batch_size, n_classes, reduction, - ignore_index); - } else if (x_dims.size() == 4) { - const auto in_dim2 = x_dims[2]; - const auto in_dim3 = x_dims[3]; - nll_loss_grad_2D(dx_data, dout_data, label_data, weight_data, - total_weight_data, batch_size, n_classes, in_dim2, - in_dim3, reduction, ignore_index); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/norm_utils.cu.h b/paddle/fluid/operators/norm_utils.cu.h index c400a8f4239a605414bf0d99a6a89b0ddae6c535..0ed1f2719de25bd2c138c23dd69b914a66961464 100644 --- a/paddle/fluid/operators/norm_utils.cu.h +++ b/paddle/fluid/operators/norm_utils.cu.h @@ -389,11 +389,12 @@ __global__ void DoubleGradComputeDDYWithGlobal( } template -void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, +void NormDoubleGradFunctor(const DeviceContext &ctx, const DataLayout data_layout, const Tensor *X, const Tensor *Scale, const Tensor *dY, const Tensor *Saved_mean, - const Tensor *Saved_variance, const double epsilon, + const Tensor *Saved_variance, const Tensor *Mean, + const Tensor *Variance, const double epsilon, const bool use_global_stats, const Tensor *ddX, const Tensor *ddScale, const Tensor *ddBias, Tensor *dX, Tensor *dScale, Tensor *ddY) { @@ -404,8 +405,7 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, const T *ddscale_data = (ddScale == nullptr ? nullptr : ddScale->data()); const T *ddbias_data = (ddBias == nullptr ? nullptr : ddBias->data()); - auto &dev_ctx = ctx.template device_context(); - phi::funcs::SetConstant set_constant; + phi::funcs::SetConstant set_constant; auto &x_dims = X->dims(); const int C = (data_layout == DataLayout::kNCHW ? x_dims[1] @@ -416,7 +416,7 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, Tensor scale_tmp; if (!Scale) { scale_tmp.mutable_data({C}, ctx.GetPlace()); - set_constant(dev_ctx, &scale_tmp, static_cast(1)); + set_constant(ctx, &scale_tmp, static_cast(1)); } const T *scale_data = Scale ? 
Scale->data() : scale_tmp.data(); #ifdef __HIPCC__ @@ -424,15 +424,15 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, #else const int block = 512; #endif - int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); + int max_threads = ctx.GetMaxPhysicalThreadCount(); const int max_blocks = std::max(max_threads / block, 1); int grid = std::min(C, max_blocks); int grid1 = (num + block - 1) / block; const T *mean_data, *variance_data; if (use_global_stats) { - const auto *running_mean = ctx.Input("Mean"); - const auto *running_var = ctx.Input("Variance"); + const auto *running_mean = Mean; + const auto *running_var = Variance; const auto *running_mean_data = running_mean->template data(); const auto *running_var_data = running_var->template data(); mean_data = running_mean_data; @@ -440,34 +440,35 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, } else { const T *smean_data = Saved_mean->data(); const T *svariance_data = Saved_variance->data(); + mean_data = smean_data; variance_data = svariance_data; } if (dX) { T *dx_data = dX->mutable_data(ctx.GetPlace()); - set_constant(dev_ctx, dX, static_cast(0)); + set_constant(ctx, dX, static_cast(0)); if (use_global_stats) { if (data_layout == DataLayout::kNHWC) { DoubleGradComputeDXWithGlobal< - T, DataLayout::kNHWC><<>>( + T, DataLayout::kNHWC><<>>( dy_data, ddscale_data, variance_data, epsilon, C, sample_size, num, dx_data); } else { DoubleGradComputeDXWithGlobal< - T, DataLayout::kNCHW><<>>( + T, DataLayout::kNCHW><<>>( dy_data, ddscale_data, variance_data, epsilon, C, sample_size, num, dx_data); } } else { if (data_layout == DataLayout::kNHWC) { DoubleGradComputeDX< - T, block, DataLayout::kNHWC><<>>( + T, block, DataLayout::kNHWC><<>>( x_data, mean_data, variance_data, ddx_data, dy_data, scale_data, ddscale_data, N, C, sample_size, epsilon, dx_data); } else { DoubleGradComputeDX< - T, block, DataLayout::kNCHW><<>>( + T, block, DataLayout::kNCHW><<>>( x_data, mean_data, variance_data, ddx_data, dy_data, scale_data, ddscale_data, N, C, sample_size, epsilon, dx_data); } @@ -475,28 +476,28 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, } if (dScale) { T *dscale_data = dScale->mutable_data(ctx.GetPlace()); - set_constant(dev_ctx, dScale, static_cast(0)); + set_constant(ctx, dScale, static_cast(0)); if (use_global_stats) { if (data_layout == DataLayout::kNHWC) { DoubleGradComputeDScaleWithGlobal< - T, block, DataLayout::kNHWC><<>>( + T, block, DataLayout::kNHWC><<>>( ddx_data, variance_data, dy_data, epsilon, N, C, sample_size, dscale_data); } else { DoubleGradComputeDScaleWithGlobal< - T, block, DataLayout::kNCHW><<>>( + T, block, DataLayout::kNCHW><<>>( ddx_data, variance_data, dy_data, epsilon, N, C, sample_size, dscale_data); } } else { if (data_layout == DataLayout::kNHWC) { DoubleGradComputeDScale< - T, block, DataLayout::kNHWC><<>>( + T, block, DataLayout::kNHWC><<>>( x_data, mean_data, variance_data, ddx_data, dy_data, N, C, sample_size, epsilon, dscale_data); } else { DoubleGradComputeDScale< - T, block, DataLayout::kNCHW><<>>( + T, block, DataLayout::kNCHW><<>>( x_data, mean_data, variance_data, ddx_data, dy_data, N, C, sample_size, epsilon, dscale_data); } @@ -504,28 +505,28 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, } if (ddY) { T *ddy_data = ddY->mutable_data(ctx.GetPlace()); - set_constant(dev_ctx, ddY, static_cast(0)); + set_constant(ctx, ddY, static_cast(0)); if (use_global_stats) { if (data_layout == DataLayout::kNHWC) { 
DoubleGradComputeDDYWithGlobal< - T, DataLayout::kNHWC><<>>( + T, DataLayout::kNHWC><<>>( ddx_data, scale_data, mean_data, variance_data, x_data, ddbias_data, ddscale_data, epsilon, C, sample_size, num, ddy_data); } else { DoubleGradComputeDDYWithGlobal< - T, DataLayout::kNCHW><<>>( + T, DataLayout::kNCHW><<>>( ddx_data, scale_data, mean_data, variance_data, x_data, ddbias_data, ddscale_data, epsilon, C, sample_size, num, ddy_data); } } else { if (data_layout == DataLayout::kNHWC) { DoubleGradComputeDDY< - T, block, DataLayout::kNHWC><<>>( + T, block, DataLayout::kNHWC><<>>( x_data, mean_data, variance_data, ddscale_data, ddbias_data, ddx_data, scale_data, N, C, sample_size, epsilon, ddy_data); } else { DoubleGradComputeDDY< - T, block, DataLayout::kNCHW><<>>( + T, block, DataLayout::kNCHW><<>>( x_data, mean_data, variance_data, ddscale_data, ddbias_data, ddx_data, scale_data, N, C, sample_size, epsilon, ddy_data); } diff --git a/paddle/fluid/operators/op_debug_string_test.cc b/paddle/fluid/operators/op_debug_string_test.cc index b96fcaa486cce8099cf1d03c7d948ea74c1923ad..372a71706ab5ec72b6da4cbac1b63333f42cb265 100644 --- a/paddle/fluid/operators/op_debug_string_test.cc +++ b/paddle/fluid/operators/op_debug_string_test.cc @@ -17,8 +17,10 @@ #include "glog/logging.h" #include "gtest/gtest.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/kernel_registry.h" USE_OP_ITSELF(elementwise_add_grad); +PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT); namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/optimizers/adadelta_op.cc b/paddle/fluid/operators/optimizers/adadelta_op.cc index ad7f93d73e902bbac684832d3a77ba83b517daf6..315831ddc0f290cc8c7ad1b78ce8625722f91d3b 100644 --- a/paddle/fluid/operators/optimizers/adadelta_op.cc +++ b/paddle/fluid/operators/optimizers/adadelta_op.cc @@ -12,7 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/optimizers/adadelta_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/multiary.h" namespace paddle { namespace operators { @@ -23,77 +26,6 @@ class AdadeltaOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("Param"), true, - platform::errors::InvalidArgument( - "Input(Param) of AdadeltaOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("Grad"), true, - platform::errors::InvalidArgument( - "Input(Grad) of AdadeltaOp should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasInput("AvgSquaredGrad"), true, - platform::errors::InvalidArgument( - "Input(AvgSquaredGrad) of AdadeltaOp should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasInput("AvgSquaredUpdate"), true, - platform::errors::InvalidArgument( - "Input(AvgSquaredUpdate) of AdadeltaOp should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->GetInputsVarType("Param").front() == - framework::proto::VarType::LOD_TENSOR, - true, - platform::errors::InvalidArgument( - "The input var's type should be LoDTensor, but the received is %s", - ctx->Inputs("Param").front(), - ctx->GetInputsVarType("Param").front())); - PADDLE_ENFORCE_EQ( - ctx->GetInputsVarType("Grad").front() == - framework::proto::VarType::LOD_TENSOR, - true, - platform::errors::InvalidArgument( - "The input var's type should be LoDTensor, but the received is %s", - ctx->Inputs("Grad").front(), - ctx->GetInputsVarType("Grad").front())); - - PADDLE_ENFORCE_EQ( - ctx->HasOutput("ParamOut"), true, - platform::errors::InvalidArgument( - "Output(ParamOut) of AdadeltaOp should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasOutput("AvgSquaredGradOut"), true, - platform::errors::InvalidArgument( - "Output(AvgSquaredGradOut) of AdadeltaOp should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasOutput("AvgSquaredUpdateOut"), true, - platform::errors::InvalidArgument( - "Output(AvgSquaredUpdateOut) of AdadeltaOp should not be null.")); - - auto param_dim = ctx->GetInputDim("Param"); - PADDLE_ENFORCE_EQ( - param_dim, ctx->GetInputDim("Grad"), - platform::errors::InvalidArgument( - "Param and grad input of AdadeltaOp should have same dimension.")); - PADDLE_ENFORCE_NE( - phi::product(ctx->GetInputDim("AvgSquaredGrad")), 0, - platform::errors::InvalidArgument( - "Maybe the Input variable AvgSquaredGrad has not " - "been initialized. 
You may need to confirm if you put " - "exe.run(startup_program) after optimizer.minimize " - "function.")); - PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("AvgSquaredGrad"), - platform::errors::InvalidArgument( - "Param and AvgSquaredGrad input of AdadeltaOp " - "should have same dimension")); - PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("AvgSquaredUpdate"), - platform::errors::InvalidArgument( - "Param and AvgSquaredUpdate input of AdadeltaOp " - "should have same dimension")); - - ctx->SetOutputDim("ParamOut", param_dim); - ctx->SetOutputDim("AvgSquaredGradOut", param_dim); - ctx->SetOutputDim("AvgSquaredUpdateOut", param_dim); - } - framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( @@ -149,7 +81,11 @@ $$ } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_WITHOUT_GRADIENT(adadelta, ops::AdadeltaOp, ops::AdadeltaOpMaker); -REGISTER_OP_CPU_KERNEL( - adadelta, ops::AdadeltaOpKernel, - ops::AdadeltaOpKernel); +namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(adadelta, AdadeltaInferMetaFunctor, + PD_INFER_META(phi::AdadeltaInferMeta)); +REGISTER_OPERATOR( + adadelta, ops::AdadeltaOp, ops::AdadeltaOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker, + AdadeltaInferMetaFunctor); diff --git a/paddle/fluid/operators/optimizers/adadelta_op.h b/paddle/fluid/operators/optimizers/adadelta_op.h deleted file mode 100644 index 85cfad35858bbe6b112169f196c0711d981e9446..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/optimizers/adadelta_op.h +++ /dev/null @@ -1,84 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class AdadeltaOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const auto* param_var = ctx.InputVar("Param"); - PADDLE_ENFORCE_EQ(param_var->IsType(), true, - platform::errors::InvalidArgument( - "The Var(%s)'s type should be LoDTensor, " - "but the received is %s", - ctx.InputNames("Param").front(), - framework::ToTypeName(param_var->Type()))); - const auto* grad_var = ctx.InputVar("Grad"); - PADDLE_ENFORCE_EQ(grad_var->IsType(), true, - platform::errors::InvalidArgument( - "The Var(%s)'s type should be LoDTensor, " - "but the received is %s", - ctx.InputNames("Grad").front(), - framework::ToTypeName(grad_var->Type()))); - - auto param_out_tensor = ctx.Output("ParamOut"); - auto avg_squared_grad_out_tensor = - ctx.Output("AvgSquaredGradOut"); - auto avg_squared_update_out_tensor = - ctx.Output("AvgSquaredUpdateOut"); - - param_out_tensor->mutable_data(ctx.GetPlace()); - avg_squared_grad_out_tensor->mutable_data(ctx.GetPlace()); - avg_squared_update_out_tensor->mutable_data(ctx.GetPlace()); - - T rho = static_cast(ctx.Attr("rho")); - T epsilon = static_cast(ctx.Attr("epsilon")); - - auto param = framework::EigenVector::Flatten( - *ctx.Input("Param")); - auto grad = framework::EigenVector::Flatten( - *ctx.Input("Grad")); - // Squared gradient accumulator - auto avg_squared_grad = framework::EigenVector::Flatten( - *ctx.Input("AvgSquaredGrad")); - // Squared updates accumulator - auto avg_squared_update = framework::EigenVector::Flatten( - *ctx.Input("AvgSquaredUpdate")); - auto param_out = framework::EigenVector::Flatten(*param_out_tensor); - auto avg_squared_grad_out = - framework::EigenVector::Flatten(*avg_squared_grad_out_tensor); - auto avg_squared_update_out = - framework::EigenVector::Flatten(*avg_squared_update_out_tensor); - auto& place = *ctx.template device_context().eigen_device(); - - avg_squared_grad_out.device(place) = - rho * avg_squared_grad + (1 - rho) * grad.square(); - auto update = - -((avg_squared_update + epsilon) / (avg_squared_grad_out + epsilon)) - .sqrt() * - grad; - avg_squared_update_out.device(place) = - rho * avg_squared_update + (1 - rho) * update.square(); - param_out.device(place) = param + update; - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/optimizers/adamax_op.cc b/paddle/fluid/operators/optimizers/adamax_op.cc index a95a37c980c8c9d41dc9fd352e3dace787a7c4e9..036839dd1300feac544a6f1ca661598f4360f745 100644 --- a/paddle/fluid/operators/optimizers/adamax_op.cc +++ b/paddle/fluid/operators/optimizers/adamax_op.cc @@ -12,7 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
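// Editor's note (illustrative sketch only): the hand-written AdadeltaOpKernel
// is deleted above and the op now routes through the phi infrastructure, but
// the update rule it implemented is unchanged. A scalar reference version of
// that rule, with hypothetical values:
#include <cmath>
#include <cstdio>

int main() {
  float rho = 0.95f, epsilon = 1e-6f;
  float param = 1.0f, grad = 0.1f;
  float avg_sq_grad = 0.0f, avg_sq_update = 0.0f;  // accumulator state

  // E[g^2]_t = rho * E[g^2]_{t-1} + (1 - rho) * g_t^2
  avg_sq_grad = rho * avg_sq_grad + (1 - rho) * grad * grad;
  // delta_t = -sqrt((E[dx^2]_{t-1} + eps) / (E[g^2]_t + eps)) * g_t
  float update =
      -std::sqrt((avg_sq_update + epsilon) / (avg_sq_grad + epsilon)) * grad;
  // E[dx^2]_t = rho * E[dx^2]_{t-1} + (1 - rho) * delta_t^2
  avg_sq_update = rho * avg_sq_update + (1 - rho) * update * update;
  // theta_t = theta_{t-1} + delta_t
  param += update;

  std::printf("param=%f\n", param);
  return 0;
}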
*/ -#include "paddle/fluid/operators/optimizers/adamax_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/multiary.h" namespace paddle { namespace operators { @@ -22,67 +25,6 @@ class AdamaxOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Param"), "Input", "Param", "Adamax"); - OP_INOUT_CHECK(ctx->HasInput("Grad"), "Input", "Grad", "Adamax"); - OP_INOUT_CHECK(ctx->HasInput("Moment"), "Input", "Moment", "Adamax"); - OP_INOUT_CHECK(ctx->HasInput("InfNorm"), "Input", "InfNorm", "Adamax"); - OP_INOUT_CHECK(ctx->HasInput("LearningRate"), "Input", "LearningRate", - "Adamax"); - OP_INOUT_CHECK(ctx->HasInput("Beta1Pow"), "Input", "Beta1Pow", "Adamax"); - PADDLE_ENFORCE_EQ( - ctx->GetInputsVarType("Param").front(), - framework::proto::VarType::LOD_TENSOR, - platform::errors::InvalidArgument( - "The input var's type should be LoDTensor, but the received is %s", - ctx->Inputs("Param").front(), - ctx->GetInputsVarType("Param").front())); - PADDLE_ENFORCE_EQ( - ctx->GetInputsVarType("Grad").front(), - framework::proto::VarType::LOD_TENSOR, - platform::errors::InvalidArgument( - "The input var's type should be LoDTensor, but the received is %s", - ctx->Inputs("Grad").front(), - ctx->GetInputsVarType("Grad").front())); - - OP_INOUT_CHECK(ctx->HasOutput("ParamOut"), "Output", "ParamOut", "Adamax"); - OP_INOUT_CHECK(ctx->HasOutput("MomentOut"), "Output", "MomentOut", - "Adamax"); - OP_INOUT_CHECK(ctx->HasOutput("InfNormOut"), "Output", "InfNormOut", - "Adamax"); - - auto lr_dims = ctx->GetInputDim("LearningRate"); - PADDLE_ENFORCE_NE(phi::product(lr_dims), 0, - platform::errors::InvalidArgument( - "Maybe the Input variable LearningRate has not " - "been initialized. You may need to confirm " - "if you put exe.run(startup_program) " - "after optimizer.minimize function.")); - PADDLE_ENFORCE_EQ(phi::product(lr_dims), 1, - platform::errors::InvalidArgument( - "Learning rate should have 1 dimension")); - auto beta1_pow_dims = ctx->GetInputDim("Beta1Pow"); - PADDLE_ENFORCE_EQ(phi::product(beta1_pow_dims), 1, - platform::errors::InvalidArgument( - "Beta1 power accumulator should have 1 dimension")); - auto param_dims = ctx->GetInputDim("Param"); - PADDLE_ENFORCE_EQ( - param_dims, ctx->GetInputDim("Grad"), - platform::errors::InvalidArgument( - "Param and Grad input of AdamaxOp should have same dimension")); - PADDLE_ENFORCE_EQ( - param_dims, ctx->GetInputDim("Moment"), - platform::errors::InvalidArgument( - "Param and Moment input of AdamaxOp should have same dimension")); - PADDLE_ENFORCE_EQ( - param_dims, ctx->GetInputDim("InfNorm"), - platform::errors::InvalidArgument( - "Param and InfNorm input of AdamaxOp should have same dimension")); - - ctx->SetOutputDim("ParamOut", param_dims); - ctx->SetOutputDim("MomentOut", param_dims); - ctx->SetOutputDim("InfNormOut", param_dims); - } framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( @@ -150,7 +92,11 @@ division by 0 error. 
} // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_WITHOUT_GRADIENT(adamax, ops::AdamaxOp, ops::AdamaxOpMaker); -REGISTER_OP_CPU_KERNEL( - adamax, ops::AdamaxOpKernel, - ops::AdamaxOpKernel); +DECLARE_INFER_SHAPE_FUNCTOR(adamax, AdamaxInferMetaFunctor, + PD_INFER_META(phi::AdamaxInferMeta)); + +REGISTER_OPERATOR( + adamax, ops::AdamaxOp, ops::AdamaxOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker, + AdamaxInferMetaFunctor); diff --git a/paddle/fluid/operators/optimizers/adamax_op.h b/paddle/fluid/operators/optimizers/adamax_op.h deleted file mode 100644 index df0112448b1cbc82d699dc1ee6f3444bda3b142b..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/optimizers/adamax_op.h +++ /dev/null @@ -1,82 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class AdamaxOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const auto* param_var = ctx.InputVar("Param"); - PADDLE_ENFORCE_EQ(param_var->IsType(), true, - platform::errors::InvalidArgument( - "The Var(%s)'s type should be LoDTensor, " - "but the received is %s", - ctx.InputNames("Param").front(), - framework::ToTypeName(param_var->Type()))); - const auto* grad_var = ctx.InputVar("Grad"); - PADDLE_ENFORCE_EQ(grad_var->IsType(), true, - platform::errors::InvalidArgument( - "The Var(%s)'s type should be LoDTensor, " - "but the received is %s", - ctx.InputNames("Grad").front(), - framework::ToTypeName(grad_var->Type()))); - - auto param_out_tensor = ctx.Output("ParamOut"); - auto moment_out_tensor = ctx.Output("MomentOut"); - auto inf_norm_out_tensor = ctx.Output("InfNormOut"); - - param_out_tensor->mutable_data(ctx.GetPlace()); - moment_out_tensor->mutable_data(ctx.GetPlace()); - inf_norm_out_tensor->mutable_data(ctx.GetPlace()); - - T beta1 = static_cast(ctx.Attr("beta1")); - T beta2 = static_cast(ctx.Attr("beta2")); - T epsilon = static_cast(ctx.Attr("epsilon")); - - auto param = framework::EigenVector::Flatten( - *ctx.Input("Param")); - auto grad = framework::EigenVector::Flatten( - *ctx.Input("Grad")); - auto moment = framework::EigenVector::Flatten( - *ctx.Input("Moment")); - auto inf_norm = framework::EigenVector::Flatten( - *ctx.Input("InfNorm")); - auto lr = framework::EigenVector::Flatten( - *ctx.Input("LearningRate")); - auto beta1_pow = framework::EigenVector::Flatten( - *ctx.Input("Beta1Pow")); - auto param_out = framework::EigenVector::Flatten(*param_out_tensor); - auto moment_out = framework::EigenVector::Flatten(*moment_out_tensor); - auto inf_norm_out = - framework::EigenVector::Flatten(*inf_norm_out_tensor); - auto* place = ctx.template device_context().eigen_device(); - - moment_out.device(*place) = beta1 * moment + (1 - beta1) * grad; - inf_norm_out.device(*place) = - 
grad.abs().cwiseMax((beta2 * inf_norm) + epsilon); - auto lr_t = lr / (1 - beta1_pow); - Eigen::DSizes m_dsize(moment_out_tensor->numel()); - param_out.device(*place) = - param - lr_t.broadcast(m_dsize) * (moment_out / inf_norm_out); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/optimizers/cast_with_ptr.h b/paddle/fluid/operators/optimizers/cast_with_ptr.h index ab8b4f2b8f4d37d4be62c5e1dd040a1461d0bdee..a3fbb0e59e24e9be67da5048ebc644f08b385bbf 100644 --- a/paddle/fluid/operators/optimizers/cast_with_ptr.h +++ b/paddle/fluid/operators/optimizers/cast_with_ptr.h @@ -57,8 +57,7 @@ static void LaunchCastKernel(const platform::CUDADeviceContext &ctx, PADDLE_ENFORCE_NE( static_cast(x), static_cast(y), platform::errors::InvalidArgument("Inplace cast is not supported yet.")); - int vec_size = - std::min(platform::GetVectorizedSize(x), platform::GetVectorizedSize(y)); + int vec_size = std::min(phi::GetVectorizedSize(x), phi::GetVectorizedSize(y)); switch (vec_size) { case 4: return details::VecCastKernel(ctx, x, y, n); diff --git a/paddle/fluid/operators/optimizers/dgc_momentum_op.h b/paddle/fluid/operators/optimizers/dgc_momentum_op.h index bea019f1f36e2ea21890f23b753b4df1d62c0e3b..c86f544ed77ff13cc59735971cf856f66bc12202 100644 --- a/paddle/fluid/operators/optimizers/dgc_momentum_op.h +++ b/paddle/fluid/operators/optimizers/dgc_momentum_op.h @@ -17,7 +17,7 @@ #include #include "paddle/fluid/operators/optimizers/momentum_op.h" -#include "paddle/fluid/operators/optimizers/sgd_op.h" +#include "paddle/phi/kernels/sgd_kernel.h" namespace paddle { namespace operators { @@ -26,8 +26,7 @@ template class DGCMomentumKernel : public framework::OpKernel { public: DGCMomentumKernel() - : _momentum_op_kernel(new MomentumOpKernel()), - _sgd_op_kernel(new SGDOpKernel()) {} + : _momentum_op_kernel(new MomentumOpKernel()) {} void Compute(const framework::ExecutionContext& context) const override { auto rampup_begin_step = context.Attr("rampup_begin_step"); @@ -67,12 +66,68 @@ class DGCMomentumKernel : public framework::OpKernel { } VLOG(10) << " so use sgd optimizer"; - return _sgd_op_kernel->Compute(context); + + const auto* param_var = context.InputVar("Param"); + const auto* grad_var = context.InputVar("Grad"); + auto* learning_rate = context.Input("LearningRate"); + bool multi_precision = context.Attr("multi_precision"); + if (param_var->IsType()) { + auto* param = context.Input("Param"); + auto* param_out = context.Output("ParamOut"); + auto* master_param_out = + context.Output("MasterParamOut"); + paddle::optional master_param_opt = + paddle::none; + if (multi_precision) { + auto* master_param = context.Input("MasterParam"); + master_param_opt = *master_param; + } + + if (grad_var->IsType()) { + // sgd_dense + auto* grad = context.Input("Grad"); + phi::SGDDenseKernel( + static_cast::TYPE&>(dev_ctx), + *param, *learning_rate, *grad, master_param_opt, multi_precision, + param_out, master_param_out); + } else { + // sgd dense param sparse grad + auto* grad = context.Input("Grad"); + phi::SGDDenseParamSparseGradKernel( + static_cast::TYPE&>(dev_ctx), + *param, *learning_rate, *grad, master_param_opt, multi_precision, + param_out, master_param_out); + } + } else if (param_var->IsType() && + grad_var->IsType() && + platform::is_cpu_place(context.GetPlace())) { + // sgd sparse param sparse grad + auto* param = context.Input("Param"); + auto* param_out = context.Output("ParamOut"); + auto* master_param_out = + context.Output("MasterParamOut"); + paddle::optional 
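// Editor's note (illustrative sketch only): as with Adadelta, the hand-written
// AdamaxOpKernel deleted above goes away while the update rule stays the same.
// A scalar reference of that rule with hypothetical values:
#include <algorithm>
#include <cmath>
#include <cstdio>

int main() {
  float beta1 = 0.9f, beta2 = 0.999f, epsilon = 1e-8f;
  float lr = 0.002f, beta1_pow = beta1;  // beta1^t at step t = 1
  float param = 1.0f, grad = 0.1f;
  float moment = 0.0f, inf_norm = 0.0f;  // first moment and infinity norm

  moment = beta1 * moment + (1 - beta1) * grad;
  inf_norm = std::max(std::fabs(grad), beta2 * inf_norm + epsilon);
  float lr_t = lr / (1 - beta1_pow);  // bias-corrected learning rate
  param -= lr_t * moment / inf_norm;

  std::printf("param=%f\n", param);
  return 0;
}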
master_param_opt = + paddle::none; + if (multi_precision) { + auto* master_param = context.Input("MasterParam"); + master_param_opt = *master_param; + } + auto* grad = context.Input("Grad"); + phi::SGDSparseParamSparseGradKernel( + static_cast::TYPE&>(dev_ctx), + *param, *learning_rate, *grad, master_param_opt, multi_precision, + param_out, master_param_out); + + } else { + PADDLE_THROW("gdc not support yet"); + } } private: std::unique_ptr> _momentum_op_kernel; - std::unique_ptr> _sgd_op_kernel; }; } // namespace operators diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cc b/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cc index 28c6efef14178535d7f9473c2310552037952c9f..efec50efa92ea68cb68934bde32e1f56570b0868 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cc +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cc @@ -61,30 +61,31 @@ class DistributedFusedLambInitOpMaker "The fp32 beta1 power accumulator tensor. Its shape is [1]."); AddOutput("Beta2Pow", "The fp32 beta2 power accumulator tensor. Its shape is [1]."); - AddOutput("FusedIndices", - "The param index of each element in FP32FusedParam. Its shape is " - "[M1+M2]. It is like [0,0,0,1,1,1,1,2,2,...]."); AddOutput( "FusedParamOffsets", "The numel offset of each parameter inside the FP32FusedParam. Its " "shape is [param_num + 1]. It is like [0, n_0, n_0 + n_1, n_0 + n_1 " - "+ n_2, ...]."); - AddOutput("FP32ShardFusedParamOffsets", - "The sharded numel offset of each parameter in the local rank. " - "Its shape is [fp32_local_param_num + 1]."); - AddOutput("FP16ShardFusedParamOffsets", - "The sharded numel offset of each parameter in the local rank. " - "Its shape is [fp16_local_param_num + 1]."); + "+ n_2, ...]. It should be in CPUPlace."); AddOutput( - "WeightDecay", - "The sharded fp32 weight decay tensor. Its shape is [(M1+M2)/N]."); + "FP32ShardFusedParamOffsets", + "The sharded numel offset of each parameter in the local rank. " + "Its shape is [fp32_local_param_num + 1]. It should be in CPUPlace."); + AddOutput( + "FP16ShardFusedParamOffsets", + "The sharded numel offset of each parameter in the local rank. " + "Its shape is [fp16_local_param_num + 1]. It should be in CPUPlace."); AddOutput("ParamInfo", "The param info. It should be in CPUPlace, and its shape is [6]" - "CPUPlace, and its shape is [6]. It is " + "CPUPlace, and its shape is [8]. It is " "[fp32_shard_param_start_idx, fp32_local_param_num, " - "fp32_global_param_num, fp16_shard_param_start_idx, " - "fp16_local_param_num, fp16_global_param_num]."); - + "fp32_global_param_num, fp32_weight_decay_end_idx, " + "fp16_shard_param_start_idx, " + "fp16_local_param_num, fp16_global_param_num, " + "fp16_weight_decay_end_idx]."); + AddOutput("ParamOrder", + "The reordered parameter order. Inside this op, " + "the parameter would be reordered by data type and weight decay " + "value."); AddOutput("ParamOut", "The output parameter list.").AsDuplicable(); AddOutput("MasterParamOut", "The output master parameter list. It would share the memory of " @@ -96,10 +97,8 @@ class DistributedFusedLambInitOpMaker AddAttr("beta1", "The initial value of Beta1Pow."); AddAttr("beta2", "The initial value of Beta2Pow."); - AddAttr>( - "weight_decay", - "The weight decay for each parameter. 
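// Editor's note (sketch of the dispatch pattern only, not the real phi API):
// the DGC-momentum kernel above no longer wraps an SGDOpKernel; it calls one
// of three phi SGD kernels chosen from the Param/Grad variable types, with the
// sparse-param path additionally restricted to CPU. The three-way decision
// looks roughly like this, with stand-in types:
#include <cstdio>
#include <stdexcept>

enum class VarKind { kDenseTensor, kSelectedRows };

void RunSgd(VarKind param, VarKind grad, bool on_cpu) {
  if (param == VarKind::kDenseTensor && grad == VarKind::kDenseTensor) {
    std::puts("phi::SGDDenseKernel (dense param, dense grad)");
  } else if (param == VarKind::kDenseTensor && grad == VarKind::kSelectedRows) {
    std::puts("phi::SGDDenseParamSparseGradKernel (dense param, sparse grad)");
  } else if (param == VarKind::kSelectedRows &&
             grad == VarKind::kSelectedRows && on_cpu) {
    std::puts("phi::SGDSparseParamSparseGradKernel (sparse param, sparse grad)");
  } else {
    throw std::runtime_error("dgc_momentum: unsupported Param/Grad combination");
  }
}

int main() {
  RunSgd(VarKind::kDenseTensor, VarKind::kDenseTensor, /*on_cpu=*/false);
  RunSgd(VarKind::kDenseTensor, VarKind::kSelectedRows, /*on_cpu=*/false);
  return 0;
}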
Its " - "shape is equal to the global parameter number."); + AddAttr>("apply_weight_decay", + "Whether to apply weight decay."); AddAttr("alignment", "The alignment in bytes for the fused tensors."); AddAttr("rank", "The global rank of the current process."); AddAttr("nranks", "The global world size."); diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu b/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu index 3445e9b658becda84aa678e9c1f03b3436d63b70..7d8a7186d58b402e208fc749524d996b351abeef 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu @@ -258,32 +258,6 @@ static void ShareBufferForNonInitedTensor(framework::Tensor *origin, << ") , dtype = " << fused_out->dtype(); } -template -static __global__ void LambFillFusedIndicesCUDAKernel(const OffsetT *offsets, - IndexT *out, - int offset_num, - int out_num) { - CUDA_KERNEL_LOOP_TYPE(i, out_num, int) { - auto idx = phi::funcs::LowerBound(offsets, offset_num, i); - if (idx == offset_num || offsets[idx] != i) { - --idx; - } - out[i] = idx; - } -} - -template -static void CopyVectorToTensor(const std::vector &src, - framework::Tensor *dst, - const platform::Place &place, - gpuStream_t stream) { - dst->Resize({static_cast(src.size())}); - T *dst_ptr = dst->mutable_data(place); - const T *src_ptr = src.data(); - auto nbytes = src.size() * sizeof(T); - memory::Copy(place, dst_ptr, platform::CPUPlace(), src_ptr, nbytes, stream); -} - template static void CopyVectorToCPUTensor(const std::vector &src, framework::Tensor *dst) { @@ -294,6 +268,42 @@ static void CopyVectorToCPUTensor(const std::vector &src, std::memcpy(dst_ptr, src_ptr, nbytes); } +static size_t ReorderParamGradInfoList(const std::vector &flags, + std::vector *infos) { + size_t n = infos->size(); + std::vector cur_flags; + cur_flags.reserve(n); + for (size_t i = 0; i < n; ++i) { + auto idx = (*infos)[i].idx; + cur_flags.push_back(flags[idx]); + } + + auto origin_infos = *infos; + size_t j = 0; + for (size_t i = 0; i < n; ++i) { + if (cur_flags[i]) { + (*infos)[j] = origin_infos[i]; + ++j; + } + } + size_t ret_idx = j; + + for (size_t i = 0; i < n; ++i) { + if (!cur_flags[i]) { + (*infos)[j] = origin_infos[i]; + ++j; + } + } + return ret_idx; +} + +template +static T ClipByBound(T x, T low_value, T high_value) { + if (x < low_value) return low_value; + if (x > high_value) return high_value; + return x; +} + template class DistributedFusedLambInitOpKernel : public framework::OpKernel { @@ -404,6 +414,24 @@ class DistributedFusedLambInitOpKernel info->numel_offset = 0; // not determined yet } } + const auto &apply_weight_decay = + ctx.Attr>("apply_weight_decay"); + size_t fp32_wd_end_idx = + ReorderParamGradInfoList(apply_weight_decay, &fp32_infos); + size_t fp16_wd_end_idx = + ReorderParamGradInfoList(apply_weight_decay, &fp16_infos); + + auto *param_order_t = ctx.Output("ParamOrder"); + auto param_num = fp32_infos.size() + fp16_infos.size(); + param_order_t->Resize({static_cast(param_num)}); + auto *param_order = param_order_t->mutable_data(platform::CPUPlace()); + for (size_t i = 0; i < fp32_infos.size(); ++i) { + param_order[i] = static_cast(fp32_infos[i].idx); + } + for (size_t i = 0; i < fp16_infos.size(); ++i) { + param_order[i + fp32_infos.size()] = static_cast(fp16_infos[i].idx); + } + VLOG(10) << "Fill ParamGradInfo ends"; // Step 2: determine the numel_with_padding and numel_offset @@ -568,45 +596,29 @@ class 
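// Editor's note (sketch): ReorderParamGradInfoList above is a stable
// partition -- parameters whose apply_weight_decay flag is set move to the
// front, the rest follow in their original order, and the returned index marks
// where the weight-decay group ends. The same idea on plain parameter ids:
#include <cstddef>
#include <cstdio>
#include <vector>

// Returns the number of flagged entries; entry i is flagged iff flags[ids[i]].
size_t ReorderByFlag(const std::vector<bool>& flags, std::vector<int>* ids) {
  std::vector<int> origin = *ids;
  size_t j = 0;
  for (int id : origin)  // first pass: keep flagged ids, in order
    if (flags[id]) (*ids)[j++] = id;
  size_t end_of_flagged = j;
  for (int id : origin)  // second pass: append unflagged ids
    if (!flags[id]) (*ids)[j++] = id;
  return end_of_flagged;
}

int main() {
  std::vector<bool> apply_weight_decay = {true, false, true, false};
  std::vector<int> param_ids = {0, 1, 2, 3};
  size_t wd_end = ReorderByFlag(apply_weight_decay, &param_ids);
  std::printf("wd_end=%zu order=%d,%d,%d,%d\n", wd_end, param_ids[0],
              param_ids[1], param_ids[2], param_ids[3]);  // wd_end=2, 0,2,1,3
  return 0;
}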
DistributedFusedLambInitOpKernel VLOG(10) << "Found the sharding arguments"; auto *param_info_t = ctx.Output("ParamInfo"); - param_info_t->Resize({6}); + param_info_t->Resize({8}); auto *param_info = param_info_t->mutable_data(platform::CPUPlace()); param_info[0] = static_cast(fp32_start_idx); param_info[1] = static_cast(fp32_local_param_num); param_info[2] = static_cast(fp32_infos.size()); - param_info[3] = static_cast(fp16_start_idx + fp32_infos.size()); - param_info[4] = static_cast(fp16_local_param_num); - param_info[5] = static_cast(fp16_infos.size()); + param_info[3] = ClipByBound(fp32_wd_end_idx, fp32_start_idx, + fp32_start_idx + fp32_local_param_num) - + static_cast(fp32_start_idx); + param_info[4] = static_cast(fp16_start_idx + fp32_infos.size()); + param_info[5] = static_cast(fp16_local_param_num); + param_info[6] = static_cast(fp16_infos.size()); + param_info[7] = ClipByBound(fp16_wd_end_idx, fp16_start_idx, + fp16_start_idx + fp16_local_param_num) - + static_cast(fp16_start_idx); VLOG(10) << "Start FP32 idx: " << param_info[0]; VLOG(10) << "Local FP32 param num: " << param_info[1]; VLOG(10) << "Global FP32 param num: " << param_info[2]; - VLOG(10) << "Start FP16 idx: " << param_info[3]; - VLOG(10) << "Local FP16 param num: " << param_info[4]; - VLOG(10) << "Global FP16 param num: " << param_info[5]; + VLOG(10) << "Start FP16 idx: " << param_info[4]; + VLOG(10) << "Local FP16 param num: " << param_info[5]; + VLOG(10) << "Global FP16 param num: " << param_info[6]; - // For WeightDecay, shard and perform H2D copy - const auto &origin_weight_decay = - ctx.Attr>("weight_decay"); - PADDLE_ENFORCE_EQ(params.size(), origin_weight_decay.size(), - platform::errors::InvalidArgument( - "The attr(weight_decay) should have the " - "same length with Input(Param).")); - std::vector shard_weight_decay; - shard_weight_decay.reserve(total_local_param_num); - for (size_t i = 0; i < fp32_local_param_num; ++i) { - shard_weight_decay.push_back( - origin_weight_decay[fp32_infos[i + fp32_start_idx].idx]); - } - for (size_t i = 0; i < fp16_local_param_num; ++i) { - shard_weight_decay.push_back( - origin_weight_decay[fp16_infos[i + fp16_start_idx].idx]); - } - - // For FusedIndices, launch CUDA kernel to do binary search - auto *fused_indices_t = ctx.Output("FusedIndices"); - fused_indices_t->Resize({static_cast(total_numel)}); - auto *fused_indices = fused_indices_t->mutable_data(place); std::vector numel_offsets; numel_offsets.reserve(params.size() + 1); for (const auto &info : fp32_infos) { @@ -621,21 +633,6 @@ class DistributedFusedLambInitOpKernel "The numel_offsets number must be one larger than " "the parameter number.")); VLOG(10) << "Total numel offset: " << FlattenToString(numel_offsets); - auto *fused_param_offset_t = - ctx.Output("FusedParamOffsets"); - fused_param_offset_t->Resize({static_cast(numel_offsets.size())}); - auto *fused_param_offset = fused_param_offset_t->mutable_data(place); - memory::Copy(place, fused_param_offset, platform::CPUPlace(), - numel_offsets.data(), - numel_offsets.size() * sizeof(numel_offsets[0]), stream); - auto config = platform::GetGpuLaunchConfig1D(dev_ctx, total_numel); - LambFillFusedIndicesCUDAKernel<<>>( - fused_param_offset, fused_indices, numel_offsets.size() - 1, - total_numel); - - std::vector lengths; - lengths.reserve(fp32_local_param_num + fp16_local_param_num); std::vector fp32_partial_numel_offsets; fp32_partial_numel_offsets.reserve(fp32_local_param_num + 1); @@ -659,9 +656,9 @@ class DistributedFusedLambInitOpKernel VLOG(10) << "FP32 Partial 
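// Editor's note (sketch): ParamInfo grows from 6 to 8 entries above because
// each precision now also records how many of its *local* parameters take
// weight decay. The global weight-decay end index is clamped into the local
// shard and rebased to the shard start; assumed values below:
#include <cstdio>

template <typename T>
T ClipByBound(T x, T low, T high) {  // mirrors the helper added in the patch
  if (x < low) return low;
  if (x > high) return high;
  return x;
}

int main() {
  int start_idx = 4;        // assumed: first param index owned by this rank
  int local_param_num = 3;  // assumed: number of params owned by this rank
  int wd_end_idx = 6;       // assumed: global index where weight decay stops

  int local_wd_end =
      ClipByBound(wd_end_idx, start_idx, start_idx + local_param_num) - start_idx;
  std::printf("local params with weight decay: %d of %d\n", local_wd_end,
              local_param_num);  // 2 of 3
  return 0;
}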
numel = [" << valid_start_n + fp32_infos[i].numel << "," << end_n + fp32_infos[i].numel; - lengths.push_back(end_n - valid_start_n); + auto len = end_n - valid_start_n; fp32_partial_numel_offsets.push_back(fp32_partial_numel_offsets.back() + - lengths.back()); + len); } std::vector fp16_partial_numel_offsets; @@ -682,9 +679,9 @@ class DistributedFusedLambInitOpKernel PADDLE_ENFORCE_NE(valid_start_n, end_n, platform::errors::InvalidArgument( "Indices sharding error. This may be a bug.")); - lengths.push_back(end_n - valid_start_n); + auto len = end_n - valid_start_n; fp16_partial_numel_offsets.push_back(fp16_partial_numel_offsets.back() + - lengths.back()); + len); } CopyVectorToCPUTensor(numel_offsets, @@ -696,23 +693,6 @@ class DistributedFusedLambInitOpKernel fp16_partial_numel_offsets, ctx.Output("FP16ShardFusedParamOffsets")); - // Fill the weight decay tensor - PADDLE_ENFORCE_EQ(lengths.size(), shard_weight_decay.size(), - platform::errors::InvalidArgument( - "Invalid weight decay sharding. This may be a bug.")); - std::vector wd_cpu; - for (size_t i = 0; i < shard_weight_decay.size(); ++i) { - int len = lengths[i]; - for (int j = 0; j < len; ++j) { - wd_cpu.push_back(shard_weight_decay[i]); - } - } - PADDLE_ENFORCE_EQ(wd_cpu.size() * nranks, fp32_numel + fp16_numel, - platform::errors::InvalidArgument( - "Invalid weight decay sharding. This may be a bug.")); - CopyVectorToTensor(wd_cpu, ctx.Output("WeightDecay"), - place, stream); - auto *global_scale = ctx.Output("GlobalScale"); if (!global_scale->IsInitialized()) { TensorFillConstant(dev_ctx, global_scale, {1}, 1.0f); diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc index e5b27446eb330aeb08e134332a5366c6c6ed2908..8f7c87912e93aa1bb3178d37afa641047e15a82b 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc @@ -66,28 +66,31 @@ class DistributedFusedLambOpMaker : public framework::OpProtoAndCheckerMaker { "The fp32 beta1 power accumulator tensor. Its shape is [1]."); AddInput("Beta2Pow", "The fp32 beta2 power accumulator tensor. Its shape is [1]."); - AddInput("FusedIndices", - "The param index of each element in FP32FusedParam. Its shape is " - "[M1+M2]. It is like [0,0,0,1,1,1,1,2,2,...]."); AddInput( "FusedParamOffsets", "The numel offset of each parameter inside the FP32FusedParam. Its " "shape is [param_num + 1]. It is like [0, n_0, n_0 + n_1, n_0 + n_1 " - "+ n_2, ...]."); - AddInput("FP32ShardFusedParamOffsets", - "The sharded numel offset of each parameter in the local rank. " - "Its shape is [fp32_local_param_num + 1]."); - AddInput("FP16ShardFusedParamOffsets", - "The sharded numel offset of each parameter in the local rank. " - "Its shape is [fp16_local_param_num + 1]."); - AddInput("WeightDecay", - "The sharded fp32 weight decay tensor. Its shape is [(M1+M2)/N]."); + "+ n_2, ...]. It should be in CPUPlace."); + AddInput( + "FP32ShardFusedParamOffsets", + "The sharded numel offset of each parameter in the local rank. " + "Its shape is [fp32_local_param_num + 1]. It should be in CPUPlace."); + AddInput( + "FP16ShardFusedParamOffsets", + "The sharded numel offset of each parameter in the local rank. " + "Its shape is [fp16_local_param_num + 1]. It should be in CPUPlace."); AddInput("ParamInfo", "The param info. It should be in CPUPlace, and its shape is [6]" - "CPUPlace, and its shape is [6]. It is " + "CPUPlace, and its shape is [8]. 
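// Editor's note (sketch): the *FusedParamOffsets outputs above are prefix sums
// over parameter numels with one leading zero, of shape [param_num + 1], kept
// in CPUPlace so host-side launchers can slice the fused buffer. Building such
// an offset table from assumed per-parameter sizes:
#include <cstdio>
#include <vector>

int main() {
  std::vector<int> numels = {4, 7, 2};  // assumed per-parameter numels
  std::vector<int> offsets = {0};       // shape is [param_num + 1]
  for (int n : numels) offsets.push_back(offsets.back() + n);

  // offsets = [0, n_0, n_0+n_1, n_0+n_1+n_2] = [0, 4, 11, 13];
  // parameter i occupies [offsets[i], offsets[i+1]) of the fused tensor.
  for (size_t i = 0; i + 1 < offsets.size(); ++i)
    std::printf("param %zu -> [%d, %d)\n", i, offsets[i], offsets[i + 1]);
  return 0;
}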
It is " "[fp32_shard_param_start_idx, fp32_local_param_num, " - "fp32_global_param_num, fp16_shard_param_start_idx, " - "fp16_local_param_num, fp16_global_param_num]."); + "fp32_global_param_num, fp32_weight_decay_end_idx, " + "fp16_shard_param_start_idx, " + "fp16_local_param_num, fp16_global_param_num, " + "fp16_weight_decay_end_idx]."); + AddInput("ParamOrder", + "The reordered parameter order. Inside this op, " + "the parameter would be reordered by data type and weight decay " + "value."); AddInput("LearningRate", "The fp32 learning rate tensor. Its shape is [1]."); @@ -116,6 +119,7 @@ class DistributedFusedLambOpMaker : public framework::OpProtoAndCheckerMaker { "max_global_grad_norm", "The maximum global gradient l2-norm value for clipping. If " "max_global_grad_norm <= 0, no clipping would be performed."); + AddAttr("weight_decay", "The weight decay value."); AddAttr("clip_after_allreduce", "Whether to clip before allreduce, only valid when the " "world size is larger than 1."); diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu index 3f90140f77282983f42ef03f736c35960239dd75..5b60f65442b55dc89a845859f153048e89704f70 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu @@ -19,11 +19,11 @@ #include "paddle/fluid/operators/optimizers/distributed_fused_lamb_op.h" #include "paddle/fluid/operators/optimizers/multi_tensor_apply.h" #include "paddle/fluid/operators/tensor_to_string.h" -#include "paddle/fluid/platform/aligned_vector.h" #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/for_range.h" #include "paddle/fluid/string/string_helper.h" #include "paddle/phi/core/utils/data_type.h" +#include "paddle/phi/kernels/funcs/aligned_vector.h" #ifdef __NVCC__ #include "cub/cub.cuh" @@ -66,8 +66,8 @@ struct L2NormFunctor { int i; for (i = threadIdx.x * VecSize; i + VecSize <= size; i += (BlockDim * VecSize)) { - platform::AlignedVector tmp_vec; - platform::Load(ptr + i, &tmp_vec); + phi::AlignedVector tmp_vec; + phi::Load(ptr + i, &tmp_vec); #pragma unroll for (int j = 0; j < VecSize; ++j) { auto tmp = static_cast(tmp_vec[j]); @@ -87,7 +87,7 @@ struct L2NormFunctor { } }; -template +template static __global__ void MultiTensorL2NormReduceAgainCUDAKernel( const InT *x, OutT *y, int max_chunk_num) { int tensor_id = blockIdx.x; @@ -100,11 +100,7 @@ static __global__ void MultiTensorL2NormReduceAgainCUDAKernel( } sum = BlockReduce(storage).Reduce(sum, cub::Sum()); if (threadIdx.x == 0) { - if (NeedSqrt) { - y[blockIdx.x] = static_cast(sqrtf(sum)); - } else { - y[blockIdx.x] = static_cast(sum); - } + y[blockIdx.x] = static_cast(sum); } } @@ -115,9 +111,10 @@ static int GetChunkedVecSize(const T *ptr, int chunk_size) { constexpr int max_load_bits = 128; int valid_vec_size = max_load_bits / CHAR_BIT / sizeof(T); auto address = reinterpret_cast(ptr); - constexpr int vec8 = alignof(platform::AlignedVector); - constexpr int vec4 = alignof(platform::AlignedVector); - constexpr int vec2 = alignof(platform::AlignedVector); + constexpr int vec8 = alignof(phi::AlignedVector); + constexpr int vec4 = alignof(phi::AlignedVector); + constexpr int vec2 = alignof(phi::AlignedVector); + chunk_size *= sizeof(T); if (address % vec8 == 0 && chunk_size % vec8 == 0) { return std::min(8, valid_vec_size); } else if (address % vec4 == 0 && chunk_size % vec4 == 0) { @@ -129,27 +126,26 @@ static int 
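// Editor's note (sketch): MultiTensorL2Norm above works in two passes -- each
// chunk accumulates a partial sum of squares into tmp_out, then a second
// kernel adds the per-tensor partials (the NeedSqrt branch was dropped, so the
// result is the *squared* norm). A serial C++ rendering of the two passes for
// one tensor with assumed values:
#include <cstdio>
#include <vector>

int main() {
  std::vector<float> x = {1, 2, 3, 4, 5, 6};
  const int chunk_size = 4;
  const int max_chunk_num =
      (static_cast<int>(x.size()) + chunk_size - 1) / chunk_size;

  // Pass 1: one partial sum of squares per chunk.
  std::vector<float> partial(max_chunk_num, 0.f);
  for (size_t i = 0; i < x.size(); ++i) partial[i / chunk_size] += x[i] * x[i];

  // Pass 2: reduce the partials; note: no sqrt here.
  float squared_norm = 0.f;
  for (float p : partial) squared_norm += p;
  std::printf("||x||^2 = %f\n", squared_norm);  // 91
  return 0;
}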
GetChunkedVecSize(const T *ptr, int chunk_size) { } } -#define PD_VEC_MULTI_TENSOR_APPLY_CASE(__vec_size, ...) \ - case __vec_size: { \ - constexpr int kVecSize = __vec_size; \ - __VA_ARGS__; \ - break; \ +#define PD_VEC_LAUNCH_KERNEL_CASE(__vec_size, ...) \ + case __vec_size: { \ + constexpr int kVecSize = __vec_size; \ + __VA_ARGS__; \ + break; \ } -#define PD_VEC_MULTI_TENSOR_APPLY(__vec_size, ...) \ - do { \ - switch (__vec_size) { \ - PD_VEC_MULTI_TENSOR_APPLY_CASE(8, __VA_ARGS__); \ - PD_VEC_MULTI_TENSOR_APPLY_CASE(4, __VA_ARGS__); \ - PD_VEC_MULTI_TENSOR_APPLY_CASE(2, __VA_ARGS__); \ - PD_VEC_MULTI_TENSOR_APPLY_CASE(1, __VA_ARGS__); \ - } \ +#define PD_VEC_LAUNCH_KERNEL(__vec_size, ...) \ + do { \ + switch (__vec_size) { \ + PD_VEC_LAUNCH_KERNEL_CASE(8, __VA_ARGS__); \ + PD_VEC_LAUNCH_KERNEL_CASE(4, __VA_ARGS__); \ + PD_VEC_LAUNCH_KERNEL_CASE(2, __VA_ARGS__); \ + PD_VEC_LAUNCH_KERNEL_CASE(1, __VA_ARGS__); \ + } \ } while (0) // TODO(zengjinle): which chunk_size is better? -template +template static void MultiTensorL2Norm(const platform::CUDAPlace &place, gpuStream_t stream, const InT *x, const int *offsets, int n, OutT *y, @@ -158,7 +154,7 @@ static void MultiTensorL2Norm(const platform::CUDAPlace &place, constexpr int kNumTensor = MaxTensorNumPerLaunch; constexpr int kNumChunk = MaxChunkNumPerLaunch; - constexpr int kBlockDim = BlockDim; + constexpr int kBlockDim = 512; int max_chunk_num = -1; int vec_size = 8; @@ -181,22 +177,22 @@ static void MultiTensorL2Norm(const platform::CUDAPlace &place, auto *tmp_out_ptr = tmp_out.Alloc(n * max_chunk_num); FillZeroWithPtr(tmp_out_ptr, n * max_chunk_num, stream); -#define PD_LAUNCH_MULTI_TENSOR_APPLY_KERNEL \ - do { \ - using FunctorT = L2NormFunctor; \ - VLOG(10) << __func__ << " " << typeid(InT).name() \ - << " VecSize = " << kVecSize; \ - MultiTensorApply( \ - FunctorT(), stream, offsets, n, chunk_size, x, tmp_out_ptr, \ - max_chunk_num); \ +#define PD_LAUNCH_MULTI_TENSOR_APPLY_L2_NORM_KERNEL \ + do { \ + using FunctorT = L2NormFunctor; \ + VLOG(10) << __func__ << " " << typeid(InT).name() \ + << " VecSize = " << kVecSize; \ + MultiTensorApply( \ + FunctorT(), stream, offsets, n, chunk_size, kBlockDim, x, tmp_out_ptr, \ + max_chunk_num); \ } while (0) - PD_VEC_MULTI_TENSOR_APPLY(vec_size, PD_LAUNCH_MULTI_TENSOR_APPLY_KERNEL); -#undef PD_LAUNCH_MULTI_TENSOR_APPLY_KERNEL + PD_VEC_LAUNCH_KERNEL(vec_size, PD_LAUNCH_MULTI_TENSOR_APPLY_L2_NORM_KERNEL); +#undef PD_LAUNCH_MULTI_TENSOR_APPLY_L2_NORM_KERNEL - MultiTensorL2NormReduceAgainCUDAKernel<<>>( - tmp_out_ptr, y, max_chunk_num); + MultiTensorL2NormReduceAgainCUDAKernel< + MT, OutT, kBlockDim><<>>(tmp_out_ptr, y, + max_chunk_num); } template @@ -208,34 +204,17 @@ static void LogParamAndTrustRatioDivSquareNorm( auto tensors = ctx.MultiInput("Param"); if (tensors.empty()) return; + const auto *order = ctx.Input("ParamOrder")->data(); + size_t n = tensors.size(); auto place = tensors[0]->place(); auto pn_vec = ToVector(param_square_norm, n, place); auto tn_vec = ToVector(trust_ratio_div_square_norm, n, place); - std::vector fp32_indices, fp16_indices; - fp32_indices.reserve(n); - fp16_indices.reserve(n); - for (size_t i = 0; i < n; ++i) { - const auto *t = tensors[i]; - if (t->dtype() == phi::DataType::FLOAT32) { - fp32_indices.push_back(i); - } else if (t->dtype() == phi::DataType::FLOAT16) { - fp16_indices.push_back(i); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Unsupported data type %s.", t->dtype())); - } - } - - for (auto idx : fp16_indices) { - 
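// Editor's note (sketch): GetChunkedVecSize and the PD_VEC_LAUNCH_KERNEL macro
// above pick the widest vector width (8/4/2/1) whose alignment both the
// pointer and the chunk byte size satisfy, capped by 128-bit loads, and then
// dispatch to a kernel templated on that width. A host-side rendering of the
// same selection and switch, with a stand-in Process kernel:
#include <algorithm>
#include <cstdint>
#include <cstdio>

template <int VecSize>
void Process(const float* x, int n) {  // stand-in for the templated kernel
  std::printf("launch with VecSize=%d over %d elements\n", VecSize, n);
}

int ChooseVecSize(const float* ptr, int chunk_bytes) {
  const int max_vec = 16 / static_cast<int>(sizeof(float));  // 128-bit loads
  auto addr = reinterpret_cast<std::uintptr_t>(ptr);
  const int widths[] = {8, 4, 2};
  for (int vec : widths) {
    int align = vec * static_cast<int>(sizeof(float));  // AlignedVector alignment
    if (addr % align == 0 && chunk_bytes % align == 0)
      return std::min(vec, max_vec);
  }
  return 1;
}

int main() {
  alignas(32) float data[16] = {};
  int vec_size = ChooseVecSize(data, static_cast<int>(sizeof(data)));
  switch (vec_size) {  // mirrors PD_VEC_LAUNCH_KERNEL's case dispatch
    case 8: Process<8>(data, 16); break;
    case 4: Process<4>(data, 16); break;
    case 2: Process<2>(data, 16); break;
    default: Process<1>(data, 16); break;
  }
  return 0;
}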
fp32_indices.push_back(idx); - } - const auto &names = ctx.GetOp().Inputs("Param"); - for (size_t i = 0; i < fp32_indices.size(); ++i) { - auto idx = fp32_indices[i]; + for (size_t i = 0; i < n; ++i) { + auto idx = order[i]; VLOG(LogLevel) << "Param " << tensors[idx]->dtype() << " " << names[idx] << " pn = " << pn_vec[i] << " , tn = " << tn_vec[i]; } @@ -325,14 +304,30 @@ struct AndFunctor { HOSTDEVICE bool operator()(bool x, bool y) const { return x && y; } }; -template +template static __global__ void ScaleCUDAKernel(const T1 *__restrict__ x, const T2 *__restrict__ scale, T1 *__restrict__ y, int num) { static_assert(sizeof(T1) <= sizeof(T2), "sizeof(T1) must be not greater than sizeof(T2)."); T2 s = scale[0]; - CUDA_KERNEL_LOOP(i, num) { + + int i = (threadIdx.x + blockIdx.x * blockDim.x) * VecSize; + int stride = blockDim.x * gridDim.x * VecSize; + + for (; i + VecSize <= num; i += stride) { + phi::AlignedVector x_vec; + phi::AlignedVector y_vec; + + phi::Load(x + i, &x_vec); +#pragma unroll + for (int j = 0; j < VecSize; ++j) { + y_vec[j] = static_cast(static_cast(x_vec[j]) * s); + } + phi::Store(y_vec, y + i); + } + + for (; i < num; ++i) { y[i] = static_cast(static_cast(x[i]) * s); } } @@ -353,7 +348,7 @@ static __global__ void CalcGradNormClipBeforeAllReduceScale( const T1 *__restrict__ global_scale, T1 max_global_grad_norm, const T1 *__restrict__ square_grad_norm, T1 *__restrict__ out1, T2 *__restrict__ out2, T1 clip_rescale_grad) { - T1 grad_norm = static_cast(sqrt(*square_grad_norm)) * clip_rescale_grad; + T1 grad_norm = static_cast(sqrtf(*square_grad_norm)) * clip_rescale_grad; T1 scale = global_scale[0] * max_global_grad_norm / (1e-6 + grad_norm); bool found_nan_inf = !isfinite(scale); if (scale >= 1 || found_nan_inf) { @@ -380,19 +375,24 @@ static __global__ void SetNanInfValueCUDAKernelTwoFlag(const bool *in_flag_p_1, ((*in_flag_p_1) || (*in_flag_p_2)) ? __int_as_float(0x7fffffffU) : 0.0f; } -// TODO(zengjinle): Vectorize this function -// NOTE: this method does not update Beta1Pow and Beta2Pow! 
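// Editor's note (sketch): the rewritten ScaleCUDAKernel above follows a common
// vectorization pattern -- a grid-stride main loop that loads and stores
// VecSize elements at a time through phi::AlignedVector (phi::Load/phi::Store),
// plus a scalar tail loop for the leftover elements. The same structure in
// plain single-threaded C++:
#include <cstdio>
#include <vector>

template <int VecSize>
void Scale(const float* x, float s, float* y, int num) {
  int i = 0;
  for (; i + VecSize <= num; i += VecSize) {  // vectorized body
    float xv[VecSize], yv[VecSize];           // stands in for AlignedVector
    for (int j = 0; j < VecSize; ++j) xv[j] = x[i + j];  // phi::Load
    for (int j = 0; j < VecSize; ++j) yv[j] = xv[j] * s;
    for (int j = 0; j < VecSize; ++j) y[i + j] = yv[j];  // phi::Store
  }
  for (; i < num; ++i) y[i] = x[i] * s;       // scalar tail
}

int main() {
  std::vector<float> x = {1, 2, 3, 4, 5}, y(5);
  Scale<4>(x.data(), 0.5f, y.data(), 5);
  for (float v : y) std::printf("%g ", v);    // 0.5 1 1.5 2 2.5
  std::printf("\n");
  return 0;
}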
-template -static __global__ void UpdateLambMoment( +template +static __global__ void UpdateLambMomentAndTrustRatioDivCUDAKernel( const T *__restrict__ param_p, const GradT *__restrict__ grad_p, const T *__restrict__ square_grad_norm_p, - const T *__restrict__ global_scale, const IndexT *__restrict__ indices, - const T *__restrict__ weight_decay_p, const T *__restrict__ beta1pow_p, + const T *__restrict__ global_scale, const T *__restrict__ beta1pow_p, const T *__restrict__ beta2pow_p, T *__restrict__ mom1_p, - T *__restrict__ mom2_p, T *__restrict__ trust_ratio_div_p, T beta1, T beta2, - T epsilon, T max_global_grad_norm, int num, T rescale_grad) { + T *__restrict__ mom2_p, T *__restrict__ trust_ratio_div_p, bool *found_inf, + T weight_decay, int weight_decay_end_numel, T beta1, T beta2, T epsilon, + T max_global_grad_norm, int num, T rescale_grad) { T square_grad_norm = *square_grad_norm_p; - if (!isfinite(square_grad_norm)) return; + bool need_update_found_inf = + (found_inf && threadIdx.x == 0 && blockIdx.x == 0); + if (!isfinite(square_grad_norm)) { + if (need_update_found_inf) *found_inf = true; + return; + } else if (need_update_found_inf) { + *found_inf = false; + } T scale = rescale_grad / global_scale[0]; if (max_global_grad_norm > 0) { @@ -406,27 +406,111 @@ static __global__ void UpdateLambMoment( T one_minus_beta1pow = 1 - beta1pow_p[0]; T one_minus_beta2pow = 1 - beta2pow_p[0]; - CUDA_KERNEL_LOOP(i, num) { - T p = param_p[i]; - T g = static_cast(grad_p[i]) * scale; - T weight_decay = weight_decay_p[i]; - T mom1 = mom1_p[i]; - T mom2 = mom2_p[i]; + int i = (threadIdx.x + blockIdx.x * blockDim.x) * VecSize; + int stride = blockDim.x * gridDim.x * VecSize; - mom1 = beta1 * mom1 + (1 - beta1) * g; - mom2 = beta2 * mom2 + (1 - beta2) * g * g; + for (; i + VecSize <= num; i += stride) { + phi::AlignedVector param_vec; + phi::AlignedVector grad_vec; + phi::AlignedVector mom1_vec; + phi::AlignedVector mom2_vec; + phi::AlignedVector trust_ratio_div_vec; - T mom1_unbiased = mom1 / one_minus_beta1pow; - T mom2_unbiased = mom2 / one_minus_beta2pow; - T trust_ratio_div = - mom1_unbiased / (sqrtf(mom2_unbiased) + epsilon) + weight_decay * p; + T cur_weight_decay = (i < weight_decay_end_numel) * weight_decay; + if (cur_weight_decay != static_cast(0.0)) { + phi::Load(param_p + i, ¶m_vec); + } else { +#pragma unroll + for (int j = 0; j < VecSize; ++j) { + param_vec[j] = static_cast(0); + } + } + phi::Load(grad_p + i, &grad_vec); + phi::Load(mom1_p + i, &mom1_vec); + phi::Load(mom2_p + i, &mom2_vec); + +#define PD_LAMB_MOM_TRUST_RATIO_DIV_UPDATE(__param, __grad, __mom1, __mom2, \ + __trust_ratio_div, __idx) \ + T p = __param[__idx]; \ + T g = static_cast(__grad[__idx]) * scale; \ + T mom1 = __mom1[__idx]; \ + T mom2 = __mom2[__idx]; \ + mom1 = beta1 * mom1 + (1 - beta1) * g; \ + mom2 = beta2 * mom2 + (1 - beta2) * g * g; \ + T mom1_unbiased = mom1 / one_minus_beta1pow; \ + T mom2_unbiased = mom2 / one_minus_beta2pow; \ + __trust_ratio_div[__idx] = \ + mom1_unbiased / (sqrtf(mom2_unbiased) + epsilon) + cur_weight_decay * p; \ + __mom1[__idx] = mom1; \ + __mom2[__idx] = mom2; + +#pragma unroll + for (int j = 0; j < VecSize; ++j) { + PD_LAMB_MOM_TRUST_RATIO_DIV_UPDATE(param_vec, grad_vec, mom1_vec, + mom2_vec, trust_ratio_div_vec, j); + } + + phi::Store(mom1_vec, mom1_p + i); + phi::Store(mom2_vec, mom2_p + i); + phi::Store(trust_ratio_div_vec, trust_ratio_div_p + i); + } - mom1_p[i] = mom1; - mom2_p[i] = mom2; - trust_ratio_div_p[i] = trust_ratio_div; + for (; i < num; ++i) { + T 
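// Editor's note (scalar sketch of the per-element math in the kernel above,
// with assumed hyper-parameters): weight decay is now a single attribute that
// applies only to elements below weight_decay_end_numel, replacing the old
// per-element WeightDecay tensor indexed through FusedIndices.
#include <cmath>
#include <cstdio>

int main() {
  float beta1 = 0.9f, beta2 = 0.999f, epsilon = 1e-6f, weight_decay = 0.01f;
  float beta1pow = beta1, beta2pow = beta2;  // beta^t at step t = 1
  int weight_decay_end_numel = 8;            // assumed shard boundary

  float p = 1.0f, g = 0.1f, mom1 = 0.f, mom2 = 0.f;
  int i = 3;  // element index within the local shard

  float cur_wd = (i < weight_decay_end_numel) ? weight_decay : 0.f;
  mom1 = beta1 * mom1 + (1 - beta1) * g;
  mom2 = beta2 * mom2 + (1 - beta2) * g * g;
  float mom1_unbiased = mom1 / (1 - beta1pow);
  float mom2_unbiased = mom2 / (1 - beta2pow);
  float trust_ratio_div =
      mom1_unbiased / (std::sqrt(mom2_unbiased) + epsilon) + cur_wd * p;

  std::printf("trust_ratio_div=%f\n", trust_ratio_div);
  return 0;
}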
cur_weight_decay = (i < weight_decay_end_numel) * weight_decay; + PD_LAMB_MOM_TRUST_RATIO_DIV_UPDATE(param_p, grad_p, mom1_p, mom2_p, + trust_ratio_div_p, i); } } +template +static void MultiTensorUpdateLambMomentAndTrustRatioDiv( + const platform::CUDADeviceContext &dev_ctx, const int *offsets, int n, + const T *param_p, const GradT *grad_p, const T *square_grad_norm_p, + const T *global_scale, const T *beta1pow_p, const T *beta2pow_p, T *mom1_p, + T *mom2_p, T *trust_ratio_div_p, bool *found_inf_p, T weight_decay, + int weight_decay_end_idx, T beta1, T beta2, T epsilon, + T max_global_grad_norm, T rescale_grad) { + if (n <= 0) return; + int numel = offsets[n] - offsets[0]; + PADDLE_ENFORCE_GE(weight_decay_end_idx, 0, + platform::errors::InvalidArgument( + "The weight decay end index should be >= 0.")); + PADDLE_ENFORCE_LE(weight_decay_end_idx, n, + platform::errors::InvalidArgument( + "The weight decay end index should be < %d.", n)); + auto weight_decay_end_numel = offsets[weight_decay_end_idx] - offsets[0]; + + int vec_size = GetChunkedVecSize(param_p, 0); + vec_size = std::min(vec_size, GetChunkedVecSize(grad_p, 0)); + vec_size = std::min(vec_size, GetChunkedVecSize(mom1_p, 0)); + vec_size = std::min(vec_size, GetChunkedVecSize(mom2_p, 0)); + vec_size = std::min(vec_size, GetChunkedVecSize(trust_ratio_div_p, 0)); + for (int i = 0; i < n; ++i) { + auto length = offsets[i + 1] - offsets[i]; + while (length % vec_size != 0) { + vec_size /= 2; + } + } + + VLOG(1) << __func__ << " VecSize = " << vec_size; + + auto stream = dev_ctx.stream(); + auto config = platform::GetGpuLaunchConfig1D(dev_ctx, numel, vec_size); + +#define PD_LAUNCH_LAMB_MOM_TRUST_RATIO_DIV_KERNEL \ + do { \ + UpdateLambMomentAndTrustRatioDivCUDAKernel<<< \ + config.block_per_grid, config.thread_per_block, 0, stream>>>( \ + param_p, grad_p, square_grad_norm_p, global_scale, beta1pow_p, \ + beta2pow_p, mom1_p, mom2_p, trust_ratio_div_p, found_inf_p, \ + weight_decay, weight_decay_end_numel, beta1, beta2, epsilon, \ + max_global_grad_norm, numel, rescale_grad); \ + } while (0) + + PD_VEC_LAUNCH_KERNEL(vec_size, PD_LAUNCH_LAMB_MOM_TRUST_RATIO_DIV_KERNEL); +#undef PD_LAUNCH_LAMB_MOM_TRUST_RATIO_DIV_KERNEL +} + template struct LambBetaPowUpdateOnceHelper { LambBetaPowUpdateOnceHelper(T *beta1pow, T *beta2pow, T beta1, T beta2) { @@ -468,33 +552,6 @@ struct LambBetaPowUpdateOnceHelper { HOSTDEVICE void UpdateBetaPows() const {} }; -template -struct LambFoundInfHelper { - public: - explicit LambFoundInfHelper(bool *found_inf) : found_inf_(found_inf) { - PADDLE_ENFORCE_NOT_NULL(found_inf, - platform::errors::InvalidArgument( - "The found_inf should not be nullptr.")); - } - - HOSTDEVICE void UpdateFoundInf(bool value) { *found_inf_ = value; } - - private: - bool *__restrict__ found_inf_; -}; - -template <> -struct LambFoundInfHelper { - public: - explicit LambFoundInfHelper(bool *found_inf) { - PADDLE_ENFORCE_EQ( - found_inf, nullptr, - platform::errors::InvalidArgument("The found_inf should be nullptr.")); - } - - HOSTDEVICE void UpdateFoundInf(bool) {} -}; - template struct LambParamHelper { LambParamHelper(T *param, MasterT *master_param) { @@ -509,12 +566,9 @@ struct LambParamHelper { master_param_ = master_param; } - HOSTDEVICE void SetParam(int i, MasterT updated_p) { - param_[i] = static_cast(updated_p); - master_param_[i] = updated_p; - } + HOSTDEVICE T *__restrict__ ParamPtr() { return param_; } - HOSTDEVICE MasterT GetParam(int i) { return master_param_[i]; } + HOSTDEVICE MasterT *__restrict__ MasterParamPtr() { return 
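// Editor's note (sketch): the host launcher above picks one vector width for
// the whole fused buffer, then halves it until every tensor's length is a
// multiple of it, so no vectorized access straddles a tensor boundary.
// Assumed offsets below:
#include <cstdio>
#include <vector>

int main() {
  std::vector<int> offsets = {0, 8, 14, 30};  // assumed fused param offsets
  int vec_size = 8;                           // best case from alignment checks
  for (size_t i = 0; i + 1 < offsets.size(); ++i) {
    int length = offsets[i + 1] - offsets[i];
    while (length % vec_size != 0) vec_size /= 2;
  }
  std::printf("vec_size=%d\n", vec_size);     // the length-6 tensor forces 2
  return 0;
}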
master_param_; } private: T *__restrict__ param_; @@ -538,158 +592,169 @@ struct LambParamHelper { param_ = param; } - HOSTDEVICE void SetParam(int i, MasterT updated_p) { - param_[i] = static_cast(updated_p); - } + HOSTDEVICE T *__restrict__ ParamPtr() { return param_; } - HOSTDEVICE MasterT GetParam(int i) { - return static_cast>(param_[i]); - } + HOSTDEVICE constexpr MasterT *MasterParamPtr() { return nullptr; } private: T *__restrict__ param_; }; -template -struct LambParamAndBetaPowsUpdateHelper - : public LambParamHelper, - public LambBetaPowUpdateOnceHelper, NeedUpdateBetaPow>, - public LambFoundInfHelper { - LambParamAndBetaPowsUpdateHelper( - ParamT *param, MasterT *master_param, MasterT *beta1pow, - MasterT *beta2pow, MasterT beta1, MasterT beta2, - bool *found_inf, const MasterT *trust_ratio_div, - const MasterT *lr, const IndexT *index, +template +struct LambUpdateParamAndBetaPowsFunctor { + DEVICE void operator()( + int tensor_id, int chunk_id, int offset, int size, + LambParamHelper param_helper, + const MasterT *trust_ratio_div, const MasterT *lr, const MasterT *param_square_norm, - const MasterT *trust_ratio_div_square_norm, - const MasterT *update_flag) - : LambParamHelper(param, master_param), - LambBetaPowUpdateOnceHelper, NeedUpdateBetaPow>( - beta1pow, beta2pow, beta1, beta2), - LambFoundInfHelper(found_inf), - trust_ratio_div(trust_ratio_div), - lr(lr), - index(index), - param_square_norm(param_square_norm), - trust_ratio_div_square_norm(trust_ratio_div_square_norm), - update_flag(update_flag) {} - - const MasterT *__restrict__ trust_ratio_div; - const MasterT *__restrict__ lr; - const IndexT *__restrict__ index; - const MasterT *__restrict__ param_square_norm; - const MasterT *__restrict__ trust_ratio_div_square_norm; - const MasterT *__restrict__ update_flag; -}; + const MasterT *trust_ratio_div_square_norm, const bool *found_inf, + LambBetaPowUpdateOnceHelper, NeedUpdateBetaPow> + betapow_helper) const { + if (*found_inf) return; -template -static __global__ void LambUpdateParamAndBetaPowsCUDAKernel( - LambParamAndBetaPowsUpdateHelper - args, - int num) { - auto should_update = *args.update_flag; - if (!isfinite(should_update)) { - if (HasFoundInf && threadIdx.x == 0 && blockIdx.x == 0) { - args.UpdateFoundInf(true); + using MT = MasterT; + + MT p_square_norm = param_square_norm[tensor_id]; + MT t_square_norm = trust_ratio_div_square_norm[tensor_id]; + MT lr_value = *lr; + MT ratio = (p_square_norm != static_cast(0) && + t_square_norm != static_cast(0) + ? lr_value * sqrtf(p_square_norm / t_square_norm) + : lr_value); + + int i; + int stride = blockDim.x * VecSize; + + ParamT *param = param_helper.ParamPtr() + offset; + MT *master_param = HasMasterParam ? 
param_helper.MasterParamPtr() + offset + : param_helper.MasterParamPtr(); + trust_ratio_div += offset; + + for (i = threadIdx.x * VecSize; i + VecSize <= size; i += stride) { + phi::AlignedVector trust_ratio_div_vec; + phi::Load(trust_ratio_div + i, &trust_ratio_div_vec); + if (HasMasterParam) { + phi::AlignedVector master_param_vec; + phi::Load(master_param + i, &master_param_vec); + phi::AlignedVector param_vec; +#pragma unroll + for (int j = 0; j < VecSize; ++j) { + MT p = master_param_vec[j] - ratio * trust_ratio_div_vec[j]; + master_param_vec[j] = p; + param_vec[j] = static_cast(p); + } + phi::Store(master_param_vec, master_param + i); + phi::Store(param_vec, param + i); + } else { + phi::AlignedVector param_vec; + phi::Load(param + i, ¶m_vec); +#pragma unroll + for (int j = 0; j < VecSize; ++j) { + MT p = static_cast(param_vec[j]) - ratio * trust_ratio_div_vec[j]; + param_vec[j] = static_cast(p); + } + phi::Store(param_vec, param + i); + } + } + + for (; i < size; ++i) { + if (HasMasterParam) { + MT p = master_param[i] - ratio * trust_ratio_div[i]; + master_param[i] = p; + param[i] = static_cast(p); + } else { + MT p = static_cast(param[i]) - ratio * trust_ratio_div[i]; + param[i] = static_cast(p); + } + } + + if (NeedUpdateBetaPow && threadIdx.x == 0 && blockIdx.x == 0) { + betapow_helper.UpdateBetaPows(); } - return; - } else if (HasFoundInf && threadIdx.x == 0 && blockIdx.x == 0) { - args.UpdateFoundInf(false); } +}; - if (NeedUpdateBetaPow && threadIdx.x == 0 && blockIdx.x == 0) { - args.UpdateBetaPows(); +// TODO(zengjinle): which block_dim and chunk_size would be better? +template +static void MultiTensorUpdateLambParamAndBetaPows( + const platform::CUDADeviceContext &dev_ctx, const int *offsets, int n, + const MasterT *trust_ratio_div, const MasterT *lr, + const MasterT *param_square_norm, + const MasterT *trust_ratio_div_square_norm, const bool *found_inf, + ParamT *param, MasterT *master_param, MasterT *beta1pow, + MasterT *beta2pow, MasterT beta1, MasterT beta2, + int chunk_size = 65536) { + constexpr bool kHasMasterParam = + !(std::is_same>::value); + + bool has_beta_pow = (beta1pow != nullptr); + if (has_beta_pow) { + PADDLE_ENFORCE_NOT_NULL(beta2pow, platform::errors::InvalidArgument( + "Beta2Pow should not be nullptr.")); + } else { + PADDLE_ENFORCE_EQ(beta2pow, nullptr, platform::errors::InvalidArgument( + "Beta2Pow should be nullptr.")); } - using MT = MasterT; + const int block_dim = 512; - MT lr_value = *args.lr; - CUDA_KERNEL_LOOP(i, num) { - MT p = args.GetParam(i); - MT t = args.trust_ratio_div[i]; - auto norm_idx = args.index[i]; - MT p_square_norm = args.param_square_norm[norm_idx]; - MT t_square_norm = args.trust_ratio_div_square_norm[norm_idx]; + int vec_size = 8; + for (int i = 0; i < n; ++i) { + int offset = offsets[i] - offsets[0]; + vec_size = + std::min(vec_size, GetChunkedVecSize(param + offset, chunk_size)); + if (kHasMasterParam) { + vec_size = std::min(vec_size, + GetChunkedVecSize(master_param + offset, chunk_size)); + } + vec_size = std::min( + vec_size, GetChunkedVecSize(trust_ratio_div + offset, chunk_size)); + } - MT p_norm = static_cast(sqrtf(p_square_norm)); - MT t_norm = static_cast(sqrtf(t_square_norm)); + VLOG(1) << __func__ << " VecSize = " << vec_size; - auto update = (p_norm != static_cast(0) && t_norm != static_cast(0)) - ? 
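// Editor's note (scalar sketch of the parameter update in the functor above):
// the per-tensor trust ratio scales the learning rate by
// sqrt(||param||^2 / ||trust_ratio_div||^2), falling back to the plain
// learning rate when either squared norm is zero; fp16 parameters keep an
// fp32 master copy. Hypothetical values:
#include <cmath>
#include <cstdio>

int main() {
  float lr = 1e-3f;
  float p_square_norm = 4.0f;   // assumed ||param||^2 for this tensor
  float t_square_norm = 16.0f;  // assumed ||trust_ratio_div||^2
  float param = 1.0f, trust_ratio_div = 0.5f;

  float ratio = (p_square_norm != 0.f && t_square_norm != 0.f)
                    ? lr * std::sqrt(p_square_norm / t_square_norm)
                    : lr;
  param -= ratio * trust_ratio_div;
  std::printf("ratio=%g param=%f\n", ratio, param);
  return 0;
}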
p_norm / t_norm - : static_cast(1); + constexpr auto kNumTensor = MaxTensorNumPerLaunch; + constexpr auto kNumChunk = MaxChunkNumPerLaunch; - MT updated_p = p - lr_value * update * t; - args.SetParam(i, updated_p); - } -} + auto stream = dev_ctx.stream(); +#define PD_LAUNCH_MULTI_TENSOR_UPDATE_PARAM_BETAPOW(__has_beta_pow) \ + do { \ + using FunctorT = \ + LambUpdateParamAndBetaPowsFunctor; \ + LambParamHelper param_helper(param, \ + master_param); \ + LambBetaPowUpdateOnceHelper, __has_beta_pow> \ + betapow_helper(beta1pow, beta2pow, beta1, beta2); \ + launcher.Launch(FunctorT(), param_helper, trust_ratio_div, lr, \ + param_square_norm, trust_ratio_div_square_norm, found_inf, \ + betapow_helper); \ + } while (0) -template -static void LambUpdateParamAndBetaPows( - const platform::CUDADeviceContext &dev_ctx, - const MasterT *trust_ratio_div, const MasterT *lr, - const IndexT *index, const MasterT *param_square_norm, - const MasterT *trust_ratio_div_square_norm, - const MasterT *update_flag, MasterT **beta1pow, - MasterT **beta2pow, bool **found_inf, MasterT beta1, - MasterT beta2, int num, ParamT *param, - MasterT *master_param, gpuStream_t stream) { - if (num == 0) return; - - bool has_master_param = !(std::is_same>::value); - auto has_beta_pow = (*beta1pow) != nullptr && (*beta2pow) != nullptr; - auto has_found_inf = (*found_inf) != nullptr; - -#define PADDLE_LAUNCH_LAMB_UPDATE_PARAM_KERNEL( \ - __has_master_param, __has_beta_pow, __has_found_inf) \ - do { \ - LambParamAndBetaPowsUpdateHelper \ - helper(param, master_param, *beta1pow, *beta2pow, beta1, beta2, \ - *found_inf, trust_ratio_div, lr, index, param_square_norm, \ - trust_ratio_div_square_norm, update_flag); \ - auto config = platform::GetGpuLaunchConfig1D(dev_ctx, num); \ - LambUpdateParamAndBetaPowsCUDAKernel<<< \ - config.block_per_grid, config.thread_per_block, 0, stream>>>(helper, \ - num); \ +#define PD_LAUNCH_VEC_MULTI_TENSOR_UPDATE_PARAM_BETAPOW_CASE \ + do { \ + auto callback = [&]( \ + const MultiTensorLauncher &launcher, \ + int launch_n) { \ + if (has_beta_pow && launch_n == 0) { \ + PD_LAUNCH_MULTI_TENSOR_UPDATE_PARAM_BETAPOW(true); \ + beta1pow = nullptr; \ + beta2pow = nullptr; \ + } else { \ + PD_LAUNCH_MULTI_TENSOR_UPDATE_PARAM_BETAPOW(false); \ + } \ + }; \ + MultiTensorApplyWithCallback( \ + stream, offsets, n, chunk_size, block_dim, callback); \ } while (0) - if (has_master_param) { - if (has_beta_pow) { - if (has_found_inf) { - PADDLE_LAUNCH_LAMB_UPDATE_PARAM_KERNEL(true, true, true); - } else { - PADDLE_LAUNCH_LAMB_UPDATE_PARAM_KERNEL(true, true, false); - } - } else { - if (has_found_inf) { - PADDLE_LAUNCH_LAMB_UPDATE_PARAM_KERNEL(true, false, true); - } else { - PADDLE_LAUNCH_LAMB_UPDATE_PARAM_KERNEL(true, false, false); - } - } - } else { - if (has_beta_pow) { - if (has_found_inf) { - PADDLE_LAUNCH_LAMB_UPDATE_PARAM_KERNEL(false, true, true); - } else { - PADDLE_LAUNCH_LAMB_UPDATE_PARAM_KERNEL(false, true, false); - } - } else { - if (has_found_inf) { - PADDLE_LAUNCH_LAMB_UPDATE_PARAM_KERNEL(false, false, true); - } else { - PADDLE_LAUNCH_LAMB_UPDATE_PARAM_KERNEL(false, false, false); - } - } - } + PD_VEC_LAUNCH_KERNEL(vec_size, + PD_LAUNCH_VEC_MULTI_TENSOR_UPDATE_PARAM_BETAPOW_CASE); - *beta1pow = nullptr; - *beta2pow = nullptr; - *found_inf = nullptr; -#undef PADDLE_LAUNCH_LAMB_UPDATE_PARAM_KERNEL +#undef PD_LAUNCH_MULTI_TENSOR_UPDATE_PARAM_BETAPOW +#undef PD_LAUNCH_VEC_MULTI_TENSOR_UPDATE_PARAM_BETAPOW_CASE } #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) @@ -710,6 +775,24 @@ static bool 
CreatePreMulScaleOpIfSupported(ncclDataType_t dtype, return false; } +template +static void LaunchScaleKernel(const platform::CUDADeviceContext &dev_ctx, + const T1 *x, const T2 *scale, T1 *y, int n, + gpuStream_t stream) { + int vec_size = std::min(GetChunkedVecSize(x, 0), GetChunkedVecSize(y, 0)); + auto config = platform::GetGpuLaunchConfig1D(dev_ctx, n, vec_size); + +#define PD_LAMB_VEC_SCALE_KERNEL_CASE \ + do { \ + ScaleCUDAKernel<<>>( \ + x, scale, y, n); \ + } while (0) + + PD_VEC_LAUNCH_KERNEL(vec_size, PD_LAMB_VEC_SCALE_KERNEL_CASE); +#undef PD_LAMB_VEC_SCALE_KERNEL_CASE +} + template static void NCCLReduceScatterWithScale( const T *sendbuff, T *recvbuff, size_t recvcount, size_t nranks, @@ -725,10 +808,8 @@ static void NCCLReduceScatterWithScale( PADDLE_ENFORCE_EQ(nranks, 1, platform::errors::InvalidArgument( "nranks must be 1 when scale != nullptr.")); - auto numel = recvcount * nranks; - auto config = platform::GetGpuLaunchConfig1D(dev_ctx, numel); - ScaleCUDAKernel<<>>(sendbuff, scale, recvbuff, numel); + LaunchScaleKernel(dev_ctx, sendbuff, scale, recvbuff, recvcount * nranks, + stream); } return; } @@ -742,9 +823,7 @@ static void NCCLReduceScatterWithScale( if (scale && !should_destroy_op) { size_t numel = recvcount * nranks; T *new_sendbuff = buffer.Alloc(numel); - auto config = platform::GetGpuLaunchConfig1D(dev_ctx, numel); - ScaleCUDAKernel<<>>(sendbuff, scale, new_sendbuff, numel); + LaunchScaleKernel(dev_ctx, sendbuff, scale, new_sendbuff, numel, stream); sendbuff = new_sendbuff; } @@ -1005,15 +1084,16 @@ class DistributedFusedLambOpKernel "Too many parameter number. Only <= %d is supported.", std::numeric_limits::max())); - // Step 3: Get FusedIndices, ParamInfo - const auto *indices = GetInputTensorPtr(ctx, "FusedIndices"); + // Step 3: Get ParamInfo const auto *param_info_tensor = GetInputTensorPtr(ctx, "ParamInfo"); auto fp32_local_start_idx = param_info_tensor[0]; auto fp32_local_param_num = param_info_tensor[1]; auto fp32_global_param_num = param_info_tensor[2]; - auto fp16_local_start_idx = param_info_tensor[3]; - auto fp16_local_param_num = param_info_tensor[4]; - auto fp16_global_param_num = param_info_tensor[5]; + auto fp32_weight_decay_end_idx = param_info_tensor[3]; + auto fp16_local_start_idx = param_info_tensor[4]; + auto fp16_local_param_num = param_info_tensor[5]; + auto fp16_global_param_num = param_info_tensor[6]; + auto fp16_weight_decay_end_idx = param_info_tensor[7]; auto local_param_num = fp32_local_param_num + fp16_local_param_num; auto param_num = fp32_global_param_num + fp16_global_param_num; @@ -1031,7 +1111,7 @@ class DistributedFusedLambOpKernel << " , fp16_global_param_num = " << fp16_global_param_num; // Step 4: Get LearningRate, Moment1, Moment2, Beta1Pow, Beta2Pow, - // WeightDecay, GlobalScale, FoundInf + // GlobalScale, FoundInf const auto *global_scale = GetInputTensorPtr(ctx, "GlobalScale"); const auto *lr = GetInputTensorPtr(ctx, "LearningRate"); int64_t partial_numel = 0; @@ -1065,14 +1145,15 @@ class DistributedFusedLambOpKernel GetSameInOutTensorPtr(ctx, place, "Beta1Pow", "Beta1PowOut"); auto *beta2pow = GetSameInOutTensorPtr(ctx, place, "Beta2Pow", "Beta2PowOut"); - const float *weight_decay = GetInputTensorPtr(ctx, "WeightDecay"); auto *found_inf_t = ctx.Output("FoundInf"); found_inf_t->Resize({1}); auto *found_inf = found_inf_t->mutable_data(place); - // Step 5: Get attributes beta1, beta2, epsilon, max_grad_norm, ring_id, + // Step 5: Get attributes weight_decay, beta1, beta2, epsilon, + // max_grad_norm, ring_id, // 
use_master_param_norm, is_grad_scaled_by_nranks + auto weight_decay = ctx.Attr("weight_decay"); auto beta1 = ctx.Attr("beta1"); auto beta2 = ctx.Attr("beta2"); auto epsilon = ctx.Attr("epsilon"); @@ -1105,7 +1186,8 @@ class DistributedFusedLambOpKernel platform::float16 *fp16_sum_grad; auto fp32_numel_each_device = fp32_numel / num_devices; auto fp16_numel_each_device = fp16_numel / num_devices; - if (num_devices > 1) { + if (num_devices > 1 || + (max_global_grad_norm > 0 && !clip_after_allreduce)) { auto ptr = sum_grad_buffer.Alloc( fp32_numel_each_device * sizeof(float) + fp16_numel_each_device * sizeof(platform::float16)); @@ -1181,7 +1263,11 @@ class DistributedFusedLambOpKernel float, platform::float16><<<1, 1, 0, stream>>>( global_scale, max_global_grad_norm, fp32_square_grad_norm, fp32_scale, fp16_scale, clip_scale); - VLOG(1) << "Grad scale: " << FlattenToString(fp32_scale, 1, place); + if (fp32_scale) { + VLOG(1) << "Grad scale: " << FlattenToString(fp32_scale, 1, place); + } else { + VLOG(1) << "Grad scale: " << FlattenToString(fp16_scale, 1, place); + } if (num_devices > 1) { PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( fp32_square_grad_norm, fp32_square_grad_norm, 1, ncclFloat32, @@ -1218,36 +1304,56 @@ class DistributedFusedLambOpKernel VLOG(10) << "ReduceScatter done"; // Step 7: update the moment1, moment2. Calcuate the trust_ratio_div + auto *fused_offsets_t = ctx.Input("FusedParamOffsets"); + auto *fused_offsets = fused_offsets_t->data(); + auto *fp32_partial_fused_offsets_t = + ctx.Input("FP32ShardFusedParamOffsets"); + const auto *fp32_partial_fused_offsets = + fp32_partial_fused_offsets_t->data(); + auto *fp16_partial_fused_offsets_t = + ctx.Input("FP16ShardFusedParamOffsets"); + const auto *fp16_partial_fused_offsets = + fp16_partial_fused_offsets_t->data(); + + VLOG(1) << "FusedParamOffsets: " + << FlattenToString(fused_offsets, fused_offsets_t->numel(), + fused_offsets_t->place()); + VLOG(1) << "FP32ShardFusedParamOffsets: " + << FlattenToString(fp32_partial_fused_offsets, + fp32_partial_fused_offsets_t->numel(), + fp32_partial_fused_offsets_t->place()); + VLOG(1) << "FP16ShardFusedParamOffsets: " + << FlattenToString(fp16_partial_fused_offsets, + fp16_partial_fused_offsets_t->numel(), + fp16_partial_fused_offsets_t->place()); + memory::Buffer trust_ratio_div_buffer(place); auto *trust_ratio_div = trust_ratio_div_buffer.Alloc(partial_numel); auto fp32_offset = rank * fp32_numel_each_device; auto fp16_offset = rank * fp16_numel_each_device; if (has_fp32_param) { - auto config = - platform::GetGpuLaunchConfig1D(dev_ctx, fp32_numel_each_device); VLOG(10) << "Update FP32 Moment and TrustRatioDiv starts"; - UpdateLambMoment<<>>( + MultiTensorUpdateLambMomentAndTrustRatioDiv( + dev_ctx, fp32_partial_fused_offsets, fp32_local_param_num, fp32_param + fp32_offset, fp32_sum_grad, fp32_square_grad_norm, - global_scale, indices + fp32_offset, weight_decay, beta1pow, beta2pow, - moment1, moment2, trust_ratio_div, beta1, beta2, epsilon, - max_global_grad_norm, fp32_numel_each_device, rescale_grad); + global_scale, beta1pow, beta2pow, moment1, moment2, trust_ratio_div, + found_inf, weight_decay, fp32_weight_decay_end_idx, beta1, beta2, + epsilon, max_global_grad_norm, rescale_grad); VLOG(10) << "Update FP32 Moment and TrustRatioDiv done"; } float *master_param = nullptr; if (has_fp16_param) { master_param = fp32_param + fp32_numel; - auto config = - platform::GetGpuLaunchConfig1D(dev_ctx, fp16_numel_each_device); VLOG(10) << "Update FP16 Moment and TrustRatioDiv 
starts"; - UpdateLambMoment<<>>( + auto tmp_found_inf = has_fp32_param ? nullptr : found_inf; + MultiTensorUpdateLambMomentAndTrustRatioDiv( + dev_ctx, fp16_partial_fused_offsets, fp16_local_param_num, master_param + fp16_offset, fp16_sum_grad, fp32_square_grad_norm, - global_scale, indices + fp32_numel + fp16_offset, weight_decay, - beta1pow, beta2pow, moment1 + fp32_numel_each_device, + global_scale, beta1pow, beta2pow, moment1 + fp32_numel_each_device, moment2 + fp32_numel_each_device, - trust_ratio_div + fp32_numel_each_device, beta1, beta2, epsilon, - max_global_grad_norm, fp16_numel_each_device, rescale_grad); + trust_ratio_div + fp32_numel_each_device, tmp_found_inf, weight_decay, + fp16_weight_decay_end_idx, beta1, beta2, epsilon, + max_global_grad_norm, rescale_grad); VLOG(10) << "Update FP16 Moment and TrustRatioDiv done"; } @@ -1257,30 +1363,6 @@ class DistributedFusedLambOpKernel memory::Buffer square_norm_buffer(place); auto *param_square_norm = square_norm_buffer.Alloc(2 * param_num); auto *trust_ratio_div_square_norm = param_square_norm + param_num; - - auto *fused_offsets_t = ctx.Input("FusedParamOffsets"); - auto *fused_offsets = fused_offsets_t->data(); - auto *fp32_partial_fused_offsets_t = - ctx.Input("FP32ShardFusedParamOffsets"); - const auto *fp32_partial_fused_offsets = - fp32_partial_fused_offsets_t->data(); - auto *fp16_partial_fused_offsets_t = - ctx.Input("FP16ShardFusedParamOffsets"); - const auto *fp16_partial_fused_offsets = - fp16_partial_fused_offsets_t->data(); - - VLOG(1) << "FusedParamOffsets: " - << FlattenToString(fused_offsets, fused_offsets_t->numel(), - fused_offsets_t->place()); - VLOG(1) << "FP32ShardFusedParamOffsets: " - << FlattenToString(fp32_partial_fused_offsets, - fp32_partial_fused_offsets_t->numel(), - fp32_partial_fused_offsets_t->place()); - VLOG(1) << "FP16ShardFusedParamOffsets: " - << FlattenToString(fp16_partial_fused_offsets, - fp16_partial_fused_offsets_t->numel(), - fp16_partial_fused_offsets_t->place()); - if (num_devices > 1) { if (use_master_param_norm) { FillZeroWithPtr(param_square_norm + fp32_global_param_num, @@ -1296,11 +1378,11 @@ class DistributedFusedLambOpKernel fp16_partial_fused_offsets, fp16_local_param_num, param_square_norm + fp16_local_start_idx); } else { - // NOTE: extra computation is performed. We can improve this performance - // if needed in the future. MultiTensorL2Norm( - place, stream, fp16_param, fused_offsets + fp32_global_param_num, - fp16_global_param_num, param_square_norm + fp32_global_param_num); + place, stream, fp16_param + fused_offsets[fp16_local_start_idx] - + fused_offsets[fp32_global_param_num], + fused_offsets + fp16_local_start_idx, fp16_local_param_num, + param_square_norm + fp16_local_start_idx); } MultiTensorL2Norm(place, stream, trust_ratio_div, @@ -1333,26 +1415,29 @@ class DistributedFusedLambOpKernel // Step 9: update parameter, beta1pow, beta2pow. All gather parameters. 
if (has_fp32_param) { - LambUpdateParamAndBetaPows( - dev_ctx, trust_ratio_div, lr, indices + fp32_offset, - param_square_norm, trust_ratio_div_square_norm, fp32_square_grad_norm, - &beta1pow, &beta2pow, &found_inf, beta1, beta2, - fp32_numel_each_device, fp32_param + fp32_offset, nullptr, stream); + MultiTensorUpdateLambParamAndBetaPows( + dev_ctx, fp32_partial_fused_offsets, fp32_local_param_num, + trust_ratio_div, lr, param_square_norm + fp32_local_start_idx, + trust_ratio_div_square_norm + fp32_local_start_idx, found_inf, + fp32_param + fp32_offset, nullptr, beta1pow, beta2pow, beta1, beta2); if (num_devices > 1) { // ncclAllGather PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( fp32_param + fp32_offset, fp32_param, fp32_numel_each_device, ncclFloat32, comm, stream)); } + + beta1pow = nullptr; + beta2pow = nullptr; } if (has_fp16_param) { - LambUpdateParamAndBetaPows( - dev_ctx, trust_ratio_div + fp32_numel_each_device, lr, - indices + fp32_numel + fp16_offset, param_square_norm, - trust_ratio_div_square_norm, fp32_square_grad_norm, &beta1pow, - &beta2pow, &found_inf, beta1, beta2, fp16_numel_each_device, - fp16_param + fp16_offset, master_param + fp16_offset, stream); - + MultiTensorUpdateLambParamAndBetaPows( + dev_ctx, fp16_partial_fused_offsets, fp16_local_param_num, + trust_ratio_div + fp32_numel_each_device, lr, + param_square_norm + fp16_local_start_idx, + trust_ratio_div_square_norm + fp16_local_start_idx, found_inf, + fp16_param + fp16_offset, master_param + fp16_offset, beta1pow, + beta2pow, beta1, beta2); if (num_devices > 1) { // ncclAllGather PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( diff --git a/paddle/fluid/operators/optimizers/lars_momentum_op.cu b/paddle/fluid/operators/optimizers/lars_momentum_op.cu index df5da1b79535cc6f5e4a638e9d32c367ea7cdb9f..fe5cd066864b82c734614e33869dff1734bee6d0 100644 --- a/paddle/fluid/operators/optimizers/lars_momentum_op.cu +++ b/paddle/fluid/operators/optimizers/lars_momentum_op.cu @@ -88,8 +88,8 @@ __device__ inline void VectorizeLarsUpdate( T* param_out, MT* velocity_out, const MT mu, MT local_lr, const MT lars_weight_decay, const MT rescale_grad, const int tid, const int grid_stride, const int numel, MT* master_param_out = nullptr) { - using VecType = paddle::platform::AlignedVector; - using VecMType = paddle::platform::AlignedVector; + using VecType = phi::AlignedVector; + using VecMType = phi::AlignedVector; int main = numel >> (VecSize >> 1); int tail_offset = main * VecSize; diff --git a/paddle/fluid/operators/optimizers/multi_tensor_apply.h b/paddle/fluid/operators/optimizers/multi_tensor_apply.h index 5d8d03c733dae210e8a41a8ad78a258df558b341..179e8f452545c437e373e42d59d18f524f260cd5 100644 --- a/paddle/fluid/operators/optimizers/multi_tensor_apply.h +++ b/paddle/fluid/operators/optimizers/multi_tensor_apply.h @@ -94,11 +94,40 @@ static __global__ void MultiTensorApplyCUDAKernel( args...); } -template -static void MultiTensorApply(Functor functor, gpuStream_t stream, - const int *offsets, int n, int chunk_size, - Args... args) { +template +class MultiTensorLauncher { + public: + MultiTensorLauncher( + const TensorMetaList &meta, + const int &chunk_id, const int &chunk_size, const int &block_dim, + const gpuStream_t &stream) + : meta_(meta), + chunk_id_(chunk_id), + chunk_size_(chunk_size), + block_dim_(block_dim), + stream_(stream) {} + + template + void Launch(Functor &&functor, Args &&... 
args) const { + MultiTensorApplyCUDAKernel< + Functor, MaxTensorNumPerLaunch, + MaxChunkNumPerLaunch><<>>( + functor, meta_, chunk_size_, args...); + } + + private: + const TensorMetaList &meta_; + const int &chunk_id_; + const int &chunk_size_; + const int &block_dim_; + const gpuStream_t &stream_; +}; + +template +static void MultiTensorApplyWithCallback(gpuStream_t stream, const int *offsets, + int n, int chunk_size, int block_dim, + Callback &&callback) { if (n == 0) return; constexpr auto NumTensor = MaxTensorNumPerLaunch; @@ -110,6 +139,11 @@ static void MultiTensorApply(Functor functor, gpuStream_t stream, int numel_offset = 0; metas.start_tensor_id = 0; metas.start_chunk_id = 0; + int launch_num = 0; + + MultiTensorLauncher launcher( + metas, chunk_id, chunk_size, block_dim, stream); + for (int i = 0; i < n; ++i) { auto length = offsets[i + 1] - offsets[i]; if (tensor_id == 0) { @@ -132,9 +166,8 @@ static void MultiTensorApply(Functor functor, gpuStream_t stream, bool last_chunk = (i + 1 == n && j + 1 == chunk_num); if (tensor_full || block_full || last_chunk) { - MultiTensorApplyCUDAKernel<<>>( - functor, metas, chunk_size, args...); + callback(launcher, launch_num); + ++launch_num; chunk_id = 0; if (j + 1 == chunk_num) { // chunk for the current tensor is full metas.start_chunk_id = 0; @@ -152,5 +185,17 @@ static void MultiTensorApply(Functor functor, gpuStream_t stream, } } +template +static void MultiTensorApply(Functor functor, gpuStream_t stream, + const int *offsets, int n, int chunk_size, + int block_dim, Args &&... args) { + auto callback = [&](const MultiTensorLauncher &launcher, + int i) { launcher.Launch(functor, args...); }; + MultiTensorApplyWithCallback( + stream, offsets, n, chunk_size, block_dim, callback); +} + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/optimizers/sgd_op.cc b/paddle/fluid/operators/optimizers/sgd_op.cc index 529d60a2820ea92de0b0009b31c9f2ad04d4860a..0e3f895d276af6856c64ddd123606b087689ca9a 100644 --- a/paddle/fluid/operators/optimizers/sgd_op.cc +++ b/paddle/fluid/operators/optimizers/sgd_op.cc @@ -166,8 +166,3 @@ REGISTER_OPERATOR( paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker, ops::SGDOpInferVarType); -REGISTER_OP_CPU_KERNEL( - sgd, ops::SGDOpKernel, - ops::SGDOpKernel, - ops::SGDOpKernel); diff --git a/paddle/fluid/operators/optimizers/sgd_op.cu b/paddle/fluid/operators/optimizers/sgd_op.cu index 3149f5f56ed4964a750f61a354c6cd31a29fc526..222244a2fd1e34ace573ad4fa06775c0e5113925 100644 --- a/paddle/fluid/operators/optimizers/sgd_op.cu +++ b/paddle/fluid/operators/optimizers/sgd_op.cu @@ -166,10 +166,3 @@ class SGDOpKernel }; } // namespace operators } // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL( - sgd, ops::SGDOpKernel, - ops::SGDOpKernel, - ops::SGDOpKernel); diff --git a/paddle/fluid/operators/p_norm_op.cu b/paddle/fluid/operators/p_norm_op.cu index f2cb427a0a5b139e1ccdf960afeb6db4bcb8b5a5..d0b78b9b0643d6c5dc5b4bfeac2cf792ac349194 100644 --- a/paddle/fluid/operators/p_norm_op.cu +++ b/paddle/fluid/operators/p_norm_op.cu @@ -39,6 +39,11 @@ __device__ __forceinline__ int sgn(T val) { __device__ __forceinline__ platform::float16 inline_abs(platform::float16 x) { return static_cast(abs(static_cast(x))); } + +__device__ __forceinline__ platform::bfloat16 inline_abs(platform::bfloat16 x) { + return static_cast(abs(static_cast(x))); +} + __device__ __forceinline__ float inline_abs(float x) { return abs(x); } 
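// ---------------------------------------------------------------------------
// [Editor's note: illustrative sketch only, not part of this patch.]
// The inline_abs / inline_pow overloads above and below exist so that a single
// templated p_norm kernel can be instantiated for float16, bfloat16, float and
// double. A hypothetical kernel using that overload set would look like:
//
//   template <typename T>
//   __global__ void SumAbsPow(const T* x, T* partial, int n, T porder) {
//     // Each thread accumulates |x[i]|^porder; inline_abs and inline_pow
//     // resolve to the matching overload for T at compile time.
//     T acc = static_cast<T>(0);
//     for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
//          i += gridDim.x * blockDim.x) {
//       acc += inline_pow(inline_abs(x[i]), porder);
//     }
//     // (block-level reduction of `acc` omitted for brevity)
//   }
//
// Supporting bfloat16 therefore only requires these device overloads plus the
// extra REGISTER_OP_CUDA_KERNEL entries added later in this file's diff.
// ---------------------------------------------------------------------------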
__device__ __forceinline__ double inline_abs(double x) { return abs(x); } @@ -53,6 +58,11 @@ __device__ __forceinline__ platform::float16 inline_pow( return static_cast( pow(static_cast(base), static_cast(exponent))); } +__device__ __forceinline__ platform::bfloat16 inline_pow( + platform::bfloat16 base, platform::bfloat16 exponent) { + return static_cast( + pow(static_cast(base), static_cast(exponent))); +} __device__ __forceinline__ float inline_pow(float base, float exponent) { return pow(base, exponent); } @@ -202,9 +212,11 @@ using CUDA = paddle::platform::CUDADeviceContext; REGISTER_OP_CUDA_KERNEL(p_norm, ops::PnormCUDAKernel, + ops::PnormCUDAKernel, ops::PnormCUDAKernel, ops::PnormCUDAKernel); REGISTER_OP_CUDA_KERNEL( p_norm_grad, ops::PnormGradCUDAKernel, + ops::PnormGradCUDAKernel, ops::PnormGradCUDAKernel, ops::PnormGradCUDAKernel); diff --git a/paddle/fluid/operators/pad_constant_like_op.h b/paddle/fluid/operators/pad_constant_like_op.h index 5df167fdf726345074cdc40afd0c5b394467578f..0aedd800e1a237d4baf0092eef9bac9f7dbe862d 100644 --- a/paddle/fluid/operators/pad_constant_like_op.h +++ b/paddle/fluid/operators/pad_constant_like_op.h @@ -20,7 +20,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/math/padding.h" +#include "paddle/phi/kernels/funcs/padding.h" namespace paddle { namespace operators { @@ -50,8 +50,9 @@ class PadConstantLikeKernel : public framework::OpKernel { pads[j * 2 + 1] = static_cast(in_x->dims()[j] - in_y->dims()[j]); } - math::PaddingFunctor(rank, context, pads, pad_value, - *in_y, out); + phi::funcs::PaddingFunctor( + rank, context.template device_context(), pads, pad_value, + *in_y, out); } }; @@ -82,8 +83,9 @@ class PadConstantLikeGradKernel : public framework::OpKernel { pads[j * 2 + 1] = static_cast(in_dout->dims()[j] - in_y->dims()[j]); } - math::PaddingGradFunctor(rank, context, pads, *in_dout, - d_y); + phi::funcs::PaddingGradFunctor( + rank, context.template device_context(), pads, *in_dout, + d_y); } }; diff --git a/paddle/fluid/operators/pad_op.cc b/paddle/fluid/operators/pad_op.cc index 39acba7e58aba51942d7d8de2d89e2783fd591f9..dc162ae5782f2690fcf6378603268369e4aeb9ca 100644 --- a/paddle/fluid/operators/pad_op.cc +++ b/paddle/fluid/operators/pad_op.cc @@ -12,9 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/pad_op.h" #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/complex.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -28,37 +30,6 @@ class PadOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override { OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Pad"); OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Pad"); - - auto x_dim = ctx->GetInputDim("X"); - auto& paddings = ctx->Attrs().Get>("paddings"); - PADDLE_ENFORCE_EQ( - static_cast(paddings.size()), x_dim.size() * 2, - platform::errors::InvalidArgument( - "Size of 'paddings' dimension should be equal to 2 * size of " - "Input(X)'s dimension, but received (size of 'paddings' dimension " - "is) %d vs (2 * size of Input(X)'s dimension is) %d.", - static_cast(paddings.size()), x_dim.size() * 2)); - for (size_t i = 0; i < paddings.size(); ++i) { - PADDLE_ENFORCE_GE(paddings[i], 0, - platform::errors::InvalidArgument( - "The element of 'paddings' should >= 0, but " - "received %d for index %d.", - paddings[i], static_cast(i))); - } - std::vector out_dims(x_dim.size()); - for (int i = 0; i < x_dim.size(); ++i) { - if ((!ctx->IsRuntime()) && (x_dim[i] == -1)) { - out_dims[i] = -1; - } else { - out_dims[i] = x_dim[i] + paddings[i * 2] + paddings[i * 2 + 1]; - } - } - ctx->SetOutputDim("Out", phi::make_ddim(out_dims)); - if (out_dims[0] == x_dim[0]) { - // Only pass LoD when the first dimension is equal between - // output and input. - ctx->ShareLoD("X", /*->*/ "Out"); - } } }; @@ -160,47 +131,13 @@ class PadOpDoubleGradMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(pad, PadInferShapeFunctor, + PD_INFER_META(phi::PadInferMeta)); REGISTER_OPERATOR(pad, ops::PadOp, ops::PadOpMaker, ops::PadOpGradMaker, - ops::PadOpGradMaker); + ops::PadOpGradMaker, + PadInferShapeFunctor); REGISTER_OPERATOR(pad_grad, ops::PadOpGrad, ops::PadOpDoubleGradMaker, ops::PadOpDoubleGradMaker); -REGISTER_OP_CPU_KERNEL( - pad, ops::PadKernel, - ops::PadKernel, - ops::PadKernel, - ops::PadKernel, - ops::PadKernel>, - ops::PadKernel>); -REGISTER_OP_CPU_KERNEL( - pad_grad, ops::PadGradKernel, - ops::PadGradKernel, - ops::PadGradKernel>, - ops::PadGradKernel>); - -REGISTER_OP_CUDA_KERNEL( - pad, ops::PadKernel, - ops::PadKernel, - ops::PadKernel, - ops::PadKernel, - ops::PadKernel, - ops::PadKernel>, - ops::PadKernel>); -REGISTER_OP_CUDA_KERNEL( - pad_grad, ops::PadGradKernel, - ops::PadGradKernel, - ops::PadGradKernel, - ops::PadGradKernel>, - ops::PadGradKernel>); diff --git a/paddle/fluid/operators/pad_op.h b/paddle/fluid/operators/pad_op.h deleted file mode 100644 index d494c954e1ef73b585761acf7490a5e35beccac4..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/pad_op.h +++ /dev/null @@ -1,63 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/padding.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class PadKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto pads = context.Attr>("paddings"); - float pad_value = context.Attr("pad_value"); - auto* x = context.Input("X"); - auto* out = context.Output("Out"); - out->mutable_data(context.GetPlace()); - - int rank = x->dims().size(); - math::PaddingFunctor(rank, context, pads, - static_cast(pad_value), *x, out); - } -}; - -template -class PadGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto pads = context.Attr>("paddings"); - auto* d_out = context.Input(framework::GradVarName("Out")); - auto* d_x = context.Output(framework::GradVarName("X")); - if (d_x == nullptr) { - return; - } - - d_x->mutable_data(context.GetPlace()); - int rank = d_out->dims().size(); - math::PaddingGradFunctor(rank, context, pads, *d_out, - d_x); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/pixel_shuffle_op.cc b/paddle/fluid/operators/pixel_shuffle_op.cc index 2a127d9ad1db0c1e169fdd1e20a1568b99d228a0..21ca26f49f653d03e2710937d360091e0c4536df 100644 --- a/paddle/fluid/operators/pixel_shuffle_op.cc +++ b/paddle/fluid/operators/pixel_shuffle_op.cc @@ -124,8 +124,8 @@ class PixelShuffleGradOp : public framework::OperatorWithKernel { } // namespace paddle namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(pixel_shuffle, PixelShuffleInferShapeFunctor, - PT_INFER_META(phi::PixelShuffleInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(pixel_shuffle, PixelShuffleInferShapeFunctor, + PD_INFER_META(phi::PixelShuffleInferMeta)); REGISTER_OPERATOR(pixel_shuffle, ops::PixelShuffleOp, ops::PixelShuffleOpMaker, ops::PixelShuffleGradMaker, diff --git a/paddle/fluid/operators/poisson_op.cc b/paddle/fluid/operators/poisson_op.cc index 0cecbf0b9cb027f7032b7b20fb10ef06a79503df..d5896c4105932ef7327d7093a15cf50e87308ae5 100644 --- a/paddle/fluid/operators/poisson_op.cc +++ b/paddle/fluid/operators/poisson_op.cc @@ -87,8 +87,8 @@ class PoissonGradOpMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; namespace plat = paddle::platform; -DELCARE_INFER_SHAPE_FUNCTOR(poisson, PoissonInferShapeFunctor, - PT_INFER_META(phi::UnchangedInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(poisson, PoissonInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); REGISTER_OPERATOR(poisson, ops::PoissonOp, ops::PoissonOpMaker, ops::PoissonOpInferVarType, diff --git a/paddle/fluid/operators/pool_with_index_op.cc b/paddle/fluid/operators/pool_with_index_op.cc index e0c24935b47509dbe473a963240f4234e168a293..d061f9ae05613491cbdbff3793b57a3d89d7d6e5 100644 --- a/paddle/fluid/operators/pool_with_index_op.cc +++ b/paddle/fluid/operators/pool_with_index_op.cc @@ -81,8 +81,12 @@ class MaxPoolWithIndexOp : public framework::OperatorWithKernel { output_shape.insert(output_shape.end(), ksize.begin(), ksize.end()); } else { for (size_t i = 0; i < ksize.size(); ++i) { - output_shape.push_back(MaxPoolOutputSize(in_x_dims[i + 2], ksize[i], - paddings[i], strides[i])); + if ((!ctx->IsRuntime()) && (in_x_dims[i + 
2] < 0)) { + output_shape.push_back(in_x_dims[i + 2]); + } else { + output_shape.push_back(MaxPoolOutputSize(in_x_dims[i + 2], ksize[i], + paddings[i], strides[i])); + } } } ctx->SetOutputDim("Out", phi::make_ddim(output_shape)); diff --git a/paddle/fluid/operators/psroi_pool_op.cc b/paddle/fluid/operators/psroi_pool_op.cc index da637dfeb237dd4f17816e784882720dc2f2ff64..cfacffff234105ac9c6dc41b86f06594d319dcbb 100644 --- a/paddle/fluid/operators/psroi_pool_op.cc +++ b/paddle/fluid/operators/psroi_pool_op.cc @@ -12,15 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/psroi_pool_op.h" -#include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/multiary.h" namespace paddle { namespace operators { -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; - class PSROIPoolOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -82,75 +82,6 @@ class PSROIPoolOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( - "Input(X) of PSROIPoolOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("ROIs"), true, - platform::errors::InvalidArgument( - "Input(ROIs) of PSROIPoolOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output(Out) of PSROIPoolOp should not be null.")); - auto input_dims = ctx->GetInputDim("X"); - auto rois_dims = ctx->GetInputDim("ROIs"); - - PADDLE_ENFORCE_EQ(input_dims.size(), 4, - platform::errors::InvalidArgument( - "The format of input tensor is NCHW")); - PADDLE_ENFORCE_EQ( - rois_dims.size(), 2, - platform::errors::InvalidArgument( - "ROIs should be a 2-D LoDTensor of shape (num_rois, 4) " - "given as [(x1, y1, x2, y2), ...]")); - PADDLE_ENFORCE_EQ( - rois_dims[1], 4, - platform::errors::InvalidArgument( - "ROIs should be a 2-D LoDTensor of shape (num_rois, 4) " - "given as [(x1, y1, x2, y2), ...]")); - if (ctx->HasInput("RoisNum")) { - auto rois_num_dims = ctx->GetInputDim("RoisNum"); - PADDLE_ENFORCE_EQ(rois_num_dims.size(), 1, - platform::errors::InvalidArgument( - "The second dimension of RoisNum should " - "be 1, but received dimension is %d", - rois_num_dims.size())); - } - int pooled_height = ctx->Attrs().Get("pooled_height"); - int pooled_width = ctx->Attrs().Get("pooled_width"); - int output_channels = ctx->Attrs().Get("output_channels"); - float spatial_scale = ctx->Attrs().Get("spatial_scale"); - - PADDLE_ENFORCE_EQ( - input_dims[1], output_channels * pooled_height * pooled_width, - platform::errors::InvalidArgument( - "the channel of X(%d) " - "should be equal to the product of " - "output_channels(%d), pooled_height(%d) and pooled_width(%d)", - input_dims[1], output_channels, pooled_height, pooled_width)); - - PADDLE_ENFORCE_GT(pooled_height, 0, - platform::errors::InvalidArgument( - "The pooled output height must be greater than 0")); - PADDLE_ENFORCE_GT(pooled_width, 0, - platform::errors::InvalidArgument( - "The pooled output width must be greater than 0")); - PADDLE_ENFORCE_GT(output_channels, 1, - 
platform::errors::InvalidArgument( - "The pooled output channels must greater than 1")); - PADDLE_ENFORCE_GT(spatial_scale, 0.0f, - platform::errors::InvalidArgument( - "The spatial scale must greater than 0.")); - - auto out_dims = input_dims; - out_dims[0] = rois_dims[0]; - out_dims[1] = - output_channels; // input_dims[1] / (pooled_height * pooled_width); - out_dims[2] = pooled_height; - out_dims[3] = pooled_width; - ctx->SetOutputDim("Out", out_dims); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -164,16 +95,6 @@ class PSROIPoolGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Out")), true, - platform::errors::InvalidArgument( - "The gradient of Out should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput(framework::GradVarName("X")), true, - platform::errors::InvalidArgument( - "The gradient of X should not be null.")); - ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -204,15 +125,13 @@ class PSROIPoolGradMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(psroi_pool, PsroiPoolInferShapeFunctor, + PD_INFER_META(phi::PsroiPoolInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(psroi_pool_grad, PsroiPoolGradInferShapeFunctor, + PD_INFER_META(phi::PsroiPoolGradInferMeta)); REGISTER_OPERATOR(psroi_pool, ops::PSROIPoolOp, ops::PSROIPoolOpMaker, ops::PSROIPoolGradMaker, - ops::PSROIPoolGradMaker); -REGISTER_OPERATOR(psroi_pool_grad, ops::PSROIPoolGradOp); -REGISTER_OP_CPU_KERNEL( - psroi_pool, - ops::CPUPSROIPoolOpKernel, - ops::CPUPSROIPoolOpKernel); -REGISTER_OP_CPU_KERNEL( - psroi_pool_grad, - ops::CPUPSROIPoolGradOpKernel, - ops::CPUPSROIPoolGradOpKernel); + ops::PSROIPoolGradMaker, + PsroiPoolInferShapeFunctor); +REGISTER_OPERATOR(psroi_pool_grad, ops::PSROIPoolGradOp, + PsroiPoolGradInferShapeFunctor); diff --git a/paddle/fluid/operators/psroi_pool_op.cu b/paddle/fluid/operators/psroi_pool_op.cu deleted file mode 100644 index c1917501db8b5afebf4b7951b0f04de69758b49d..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/psroi_pool_op.cu +++ /dev/null @@ -1,350 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/psroi_pool_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; - -static constexpr int kNumCUDAThreads = 512; -static constexpr int kNumMaximumNumBlocks = 4096; - -static inline int NumBlocks(const int N) { - return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, - kNumMaximumNumBlocks); -} - -template -__global__ void GPUPSROIPoolForward( - const int nthreads, const T* input_data, const T* input_rois, - const float spatial_scale, const int input_channels, const int height, - const int width, const int output_channels, const int pooled_height, - const int pooled_width, const int* rois_batch_id_data, T* output_data) { - int index = blockIdx.x * blockDim.x + threadIdx.x; - int offset = blockDim.x * gridDim.x; - for (size_t i = index; i < nthreads; i += offset) { - // The output is in order (n, c, ph, pw) - int pw = i % pooled_width; - int ph = (i / pooled_width) % pooled_height; - int c = (i / pooled_width / pooled_height) % output_channels; - int n = i / pooled_width / pooled_height / output_channels; - - // set roi_batch_id - int roi_batch_id = rois_batch_id_data[n]; - - // [start, end) interval for spatial sampling - const T* offset_input_rois = input_rois + n * 4; - T roi_start_w = static_cast(round(offset_input_rois[0])) * spatial_scale; - T roi_start_h = static_cast(round(offset_input_rois[1])) * spatial_scale; - T roi_end_w = - static_cast(round(offset_input_rois[2]) + 1.) * spatial_scale; - T roi_end_h = - static_cast(round(offset_input_rois[3]) + 1.) * spatial_scale; - - // Force too small ROIs to be 1x1 - T roi_height = max(roi_end_h - roi_start_h, (T)0.1); // avoid 0 - T roi_width = max(roi_end_w - roi_start_w, (T)0.1); - - // Compute w and h at input feature map - T bin_size_h = roi_height / static_cast(pooled_height); - T bin_size_w = roi_width / static_cast(pooled_width); - - int hstart = floor(bin_size_h * static_cast(ph) + roi_start_h); - int wstart = floor(bin_size_w * static_cast(pw) + roi_start_w); - int hend = ceil(bin_size_h * static_cast(ph + 1) + roi_start_h); - int wend = ceil(bin_size_w * static_cast(pw + 1) + roi_start_w); - - // Add roi offsets and clip to input boundaries - hstart = min(max(hstart, 0), height); - hend = min(max(hend, 0), height); - wstart = min(max(wstart, 0), width); - wend = min(max(wend, 0), width); - bool is_empty = (hend <= hstart) || (wend <= wstart); - - int input_channel = (c * pooled_height + ph) * pooled_width + pw; - const T* offset_input_data = - input_data + - (roi_batch_id * input_channels + input_channel) * height * width; - T outsum = 0; - - for (int ih = hstart; ih < hend; ++ih) { - for (int iw = wstart; iw < wend; ++iw) { - int input_index = ih * width + iw; - outsum += offset_input_data[input_index]; - } - } - - T bin_area = static_cast((hend - hstart) * (wend - wstart)); - output_data[i] = is_empty ? 0. 
: outsum / bin_area; - } -} - -template -__global__ void GPUPSROIPoolBackward( - const int nthreads, const T* input_rois, const T* output_grad_data, - const float spatial_scale, const int input_channels, const int height, - const int width, const int output_channels, const int pooled_height, - const int pooled_width, const int* rois_batch_id_data, T* input_grad_data) { - int index = blockIdx.x * blockDim.x + threadIdx.x; - int offset = blockDim.x * gridDim.x; - for (int i = index; i < nthreads; i += offset) { - // The output is in order (n, c, ph, pw) - int pw = i % pooled_width; - int ph = (i / pooled_width) % pooled_height; - int c = (i / pooled_width / pooled_height) % output_channels; - int n = i / pooled_width / pooled_height / output_channels; - - // set roi_batch_id - int roi_batch_id = rois_batch_id_data[n]; - int input_channel = (c * pooled_height + ph) * pooled_width + pw; - int input_offset = - (roi_batch_id * input_channels + input_channel) * height * width; - T* offset_input_grad_data = input_grad_data + input_offset; - - // [start, end) interval for spatial sampling - const T* offset_input_rois = input_rois + n * 4; - T roi_start_w = static_cast(round(offset_input_rois[0])) * spatial_scale; - T roi_start_h = static_cast(round(offset_input_rois[1])) * spatial_scale; - T roi_end_w = - static_cast(round(offset_input_rois[2]) + 1.) * spatial_scale; - T roi_end_h = - static_cast(round(offset_input_rois[3]) + 1.) * spatial_scale; - - // Force too small ROIs to be 1x1 - T roi_height = max(roi_end_h - roi_start_h, (T)0.1); // avoid 0 - T roi_width = max(roi_end_w - roi_start_w, (T)0.1); - - // Compute w and h at input feature map - T bin_size_h = roi_height / static_cast(pooled_height); - T bin_size_w = roi_width / static_cast(pooled_width); - - int hstart = floor(bin_size_h * static_cast(ph) + roi_start_h); - int wstart = floor(bin_size_w * static_cast(pw) + roi_start_w); - int hend = ceil(bin_size_h * static_cast(ph + 1) + roi_start_h); - int wend = ceil(bin_size_w * static_cast(pw + 1) + roi_start_w); - - // Add roi offsets and clip to input boundaries - hstart = min(max(hstart, 0), height); - hend = min(max(hend, 0), height); - wstart = min(max(wstart, 0), width); - wend = min(max(wend, 0), width); - bool is_empty = (hend <= hstart) || (wend <= wstart); - - // Accumulate diff_val into input data - T bin_area = static_cast((hend - hstart) * (wend - wstart)); - T diff_val = is_empty ? 0. 
: output_grad_data[i] / bin_area; - for (int ih = hstart; ih < hend; ++ih) { - for (int iw = wstart; iw < wend; ++iw) { - int input_index = ih * width + iw; - platform::CudaAtomicAdd(offset_input_grad_data + input_index, diff_val); - } - } - } -} - -template -class GPUPSROIPoolOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* out = ctx.Output("Out"); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto output_channels = ctx.Attr("output_channels"); - auto spatial_scale = ctx.Attr("spatial_scale"); - - auto in_dims = in->dims(); - int batch_size = in_dims[0]; - int input_channels = in_dims[1]; - int height = in_dims[2]; - int width = in_dims[3]; - - PADDLE_ENFORCE_EQ( - input_channels, output_channels * pooled_height * pooled_width, - platform::errors::InvalidArgument( - "The channels %d of input X should equal the product of " - "output_channels %d x pooled_height %d x pooled_width %d.", - input_channels, output_channels, pooled_height, pooled_width)); - - int rois_num = rois->dims()[0]; - if (rois_num == 0) return; - int rois_batch_size; - framework::Tensor rois_batch_id_list; - rois_batch_id_list.Resize({rois_num}); - int* rois_batch_id_data = - rois_batch_id_list.mutable_data(platform::CPUPlace()); - - if (ctx.HasInput("RoisNum")) { - auto* rois_num_t = ctx.Input("RoisNum"); - rois_batch_size = rois_num_t->numel(); - auto* rois_num_data = rois_num_t->data(); - PADDLE_ENFORCE_EQ( - rois_batch_size, batch_size, - platform::errors::InvalidArgument( - "The batch size of input(ROIs) and input(X) must be " - "the same but received batch size of input(ROIs) and " - "input(X) is %d and %d respectively.", - rois_batch_size, batch_size)); - std::vector rois_num_list(rois_batch_size); - memory::Copy(platform::CPUPlace(), rois_num_list.data(), ctx.GetPlace(), - rois_num_data, sizeof(int) * rois_batch_size, 0); - int rois_num_count = 0; - for (int i = 0; i < rois_batch_size; ++i) { - rois_num_count += rois_num_list[i]; - } - PADDLE_ENFORCE_EQ( - rois_num_count, rois_num, - platform::errors::InvalidArgument( - "the rois_num from input and RoisNum must be the same")); - int start = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (int i = start; i < start + rois_num_list[n]; ++i) { - rois_batch_id_data[i] = n; - } - start += rois_num_list[n]; - } - } else { - auto rois_lod = rois->lod().back(); - rois_batch_size = rois_lod.size() - 1; - PADDLE_ENFORCE_EQ( - rois_batch_size, batch_size, - platform::errors::InvalidArgument( - "The batch size of input(ROIs) and input(X) must be " - "the same but received batch size of input(ROIs) and " - "input(X) is %d and %d respectively.", - rois_batch_size, batch_size)); - int rois_num_with_lod = rois_lod[rois_batch_size]; - PADDLE_ENFORCE_EQ(rois_num, rois_num_with_lod, - platform::errors::InvalidArgument( - "The number of rois from input(ROIs) and its LOD " - "must be the same. 
Received rois %d of input(ROIs) " - "but the number of rois %d from its LOD is %d", - rois_num, rois_num_with_lod)); - - // set rois batch id - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - rois_batch_id_data[i] = n; - } - } - } - framework::Tensor rois_batch_id_list_gpu; - framework::TensorCopy(rois_batch_id_list, ctx.GetPlace(), - ctx.device_context(), &rois_batch_id_list_gpu); - - int output_size = out->numel(); - int blocks = NumBlocks(output_size); - int threads = kNumCUDAThreads; - - // call cuda kernel function - GPUPSROIPoolForward< - T><<>>( - output_size, in->data(), rois->data(), spatial_scale, - input_channels, height, width, output_channels, pooled_height, - pooled_width, rois_batch_id_list_gpu.data(), - out->mutable_data(ctx.GetPlace())); - } -}; - -template -class GPUPSROIPoolGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - - auto* output_grad = ctx.Input(framework::GradVarName("Out")); - auto* input_grad = ctx.Output(framework::GradVarName("X")); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto output_channels = ctx.Attr("output_channels"); - auto spatial_scale = ctx.Attr("spatial_scale"); - - int rois_num = rois->dims()[0]; - int input_channels = in->dims()[1]; - int height = in->dims()[2]; - int width = in->dims()[3]; - - if (input_grad) { - // set roi batch id - framework::Tensor rois_batch_id_list; - rois_batch_id_list.Resize({rois_num}); - int* rois_batch_id_data = - rois_batch_id_list.mutable_data(platform::CPUPlace()); - int rois_batch_size; - if (ctx.HasInput("RoisNum")) { - auto* rois_num_t = ctx.Input("RoisNum"); - rois_batch_size = rois_num_t->numel(); - std::vector rois_num_list(rois_batch_size); - memory::Copy(platform::CPUPlace(), rois_num_list.data(), ctx.GetPlace(), - rois_num_t->data(), sizeof(int) * rois_batch_size, 0); - int start = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (int i = start; i < start + rois_num_list[n]; ++i) { - rois_batch_id_data[i] = n; - } - start += rois_num_list[n]; - } - } else { - auto rois_lod = rois->lod().back(); - rois_batch_size = rois_lod.size() - 1; - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - rois_batch_id_data[i] = n; - } - } - } - framework::Tensor rois_batch_id_list_gpu; - framework::TensorCopy(rois_batch_id_list, ctx.GetPlace(), - ctx.device_context(), &rois_batch_id_list_gpu); - - input_grad->mutable_data(ctx.GetPlace()); - phi::funcs::SetConstant set_zero; - set_zero(ctx.cuda_device_context(), input_grad, static_cast(0)); - - int output_grad_size = output_grad->numel(); - int blocks = NumBlocks(output_grad_size); - int threads = kNumCUDAThreads; - - if (output_grad_size > 0) { - GPUPSROIPoolBackward< - T><<>>( - output_grad_size, rois->data(), output_grad->data(), - spatial_scale, input_channels, height, width, output_channels, - pooled_height, pooled_width, rois_batch_id_list_gpu.data(), - input_grad->mutable_data(ctx.GetPlace())); - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - psroi_pool, - ops::GPUPSROIPoolOpKernel, - ops::GPUPSROIPoolOpKernel); -REGISTER_OP_CUDA_KERNEL( - psroi_pool_grad, - ops::GPUPSROIPoolGradOpKernel, - ops::GPUPSROIPoolGradOpKernel); diff --git 
a/paddle/fluid/operators/psroi_pool_op.h b/paddle/fluid/operators/psroi_pool_op.h deleted file mode 100644 index 3f020d93391b0e648898c1b83858a7bd9809aa03..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/psroi_pool_op.h +++ /dev/null @@ -1,295 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -template -class CPUPSROIPoolOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* out = ctx.Output("Out"); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto spatial_scale = ctx.Attr("spatial_scale"); - auto output_channels = ctx.Attr("output_channels"); - - auto in_dims = in->dims(); - int batch_size = in_dims[0]; - int input_channels = in_dims[1]; - int height = in_dims[2]; - int width = in_dims[3]; - int rois_num = rois->dims()[0]; - - PADDLE_ENFORCE_EQ(input_channels, - output_channels * pooled_height * pooled_width, - platform::errors::InvalidArgument( - "the channels of input " - "X should equal the product of " - "output_channels x pooled_height x pooled_width")); - - auto in_stride = phi::stride(in_dims); - auto out_stride = phi::stride(out->dims()); - - const T* input_data = in->data(); - - framework::Tensor rois_batch_id_list; - rois_batch_id_list.Resize({rois_num}); - int* rois_batch_id_data = - rois_batch_id_list.mutable_data(ctx.GetPlace()); - int rois_batch_size; - if (ctx.HasInput("RoisNum")) { - auto* rois_num_t = ctx.Input("RoisNum"); - rois_batch_size = rois_num_t->numel(); - auto* rois_num_data = rois_num_t->data(); - PADDLE_ENFORCE_EQ( - rois_batch_size, batch_size, - platform::errors::InvalidArgument( - "The batch size of rois and the batch size of images " - " must be the same. 
But received the batch size of rois is %d, " - "and the batch size of images is %d", - rois_batch_size, batch_size)); - int rois_num_count = 0; - for (int i = 0; i < rois_batch_size; ++i) { - rois_num_count += rois_num_data[i]; - } - PADDLE_ENFORCE_EQ( - rois_num_count, rois_num, - platform::errors::InvalidArgument( - "the rois_num from input and RoisNum must be the same")); - int start = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (int i = start; i < start + rois_num_data[n]; ++i) { - rois_batch_id_data[i] = n; - } - start += rois_num_data[n]; - } - } else { - auto rois_lod = rois->lod().back(); - rois_batch_size = rois_lod.size() - 1; - PADDLE_ENFORCE_EQ( - rois_batch_size, batch_size, - platform::errors::InvalidArgument("the rois_batch_size and input(X) " - "batch_size should be the same.")); - int rois_num_with_lod = rois_lod[rois_batch_size]; - PADDLE_ENFORCE_EQ( - rois_num_with_lod, rois_num, - platform::errors::InvalidArgument( - "the rois_num from input and lod must be the same")); - // calculate batch id index for each roi according to LoD - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - rois_batch_id_data[i] = n; - } - } - } - T* output_data = out->mutable_data(ctx.GetPlace()); - const T* input_rois = rois->data(); - - // calculate psroipooling, parallel processing can be implemented per ROI - for (int n = 0; n < rois_num; ++n) { - // set roi batch id - int roi_batch_id = rois_batch_id_data[n]; - - // [start, end) interval for spatial sampling - const T* offset_input_rois = input_rois + n * 4; - T roi_start_w = - static_cast(round(offset_input_rois[0])) * spatial_scale; - T roi_start_h = - static_cast(round(offset_input_rois[1])) * spatial_scale; - T roi_end_w = - static_cast(round(offset_input_rois[2]) + 1.) * spatial_scale; - T roi_end_h = - static_cast(round(offset_input_rois[3]) + 1.) * spatial_scale; - // Force too small rois to be 1 x 1 - T roi_height = std::max(roi_end_h - roi_start_h, (T)0.1); // avoid 0 - T roi_width = std::max(roi_end_w - roi_start_w, (T)0.1); - - // Compute bin size w and h at input feature map - T bin_size_h = roi_height / static_cast(pooled_height); - T bin_size_w = roi_width / static_cast(pooled_width); - - // calculate each pixel of the output feature map. 
- int out_roi_offset = n * out_stride[0]; - for (int c = 0; c < output_channels; ++c) { - // per category - int out_plane_offset = out_roi_offset + c * out_stride[1]; - for (int ph = 0; ph < pooled_height; ++ph) { - int out_row_offset = out_plane_offset + ph * out_stride[2]; - for (int pw = 0; pw < pooled_width; ++pw) { - // calculate w and h at input feature map - int hstart = floor(static_cast(ph) * bin_size_h + roi_start_h); - int wstart = floor(static_cast(pw) * bin_size_w + roi_start_w); - int hend = ceil(static_cast(ph + 1) * bin_size_h + roi_start_h); - int wend = ceil(static_cast(pw + 1) * bin_size_w + roi_start_w); - // Add roi offsets and clip to input boundaries - hstart = std::min(std::max(hstart, 0), height); - wstart = std::min(std::max(wstart, 0), width); - hend = std::min(std::max(hend, 0), height); - wend = std::min(std::max(wend, 0), width); - - int output_index = out_row_offset + pw; - int input_channel = (c * pooled_height + ph) * pooled_width + pw; - int input_plane_offset = - roi_batch_id * in_stride[0] + input_channel * in_stride[1]; - const T* offset_input_data = input_data + input_plane_offset; - T out_sum = 0.; - bool is_empty = (hend <= hstart) || (wend <= wstart); - for (int ih = hstart; ih < hend; ++ih) { - for (int iw = wstart; iw < wend; ++iw) { - int input_index = ih * in_stride[2] + iw; - out_sum += offset_input_data[input_index]; - } - } - T bin_area = (hend - hstart) * (wend - wstart); - output_data[output_index] = is_empty ? 0. : out_sum / bin_area; - } - } - } - } - return; - } -}; - -template -class CPUPSROIPoolGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* output_grad = - ctx.Input(framework::GradVarName("Out")); - auto* input_grad = - ctx.Output(framework::GradVarName("X")); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto output_channels = ctx.Attr("output_channels"); - auto spatial_scale = ctx.Attr("spatial_scale"); - - if (input_grad) { - auto in_dims = in->dims(); - int input_channels = in_dims[1]; - int height = in_dims[2]; - int width = in_dims[3]; - int rois_num = rois->dims()[0]; - - // set roi batch id - framework::Tensor rois_batch_id_list; - rois_batch_id_list.Resize({rois_num}); - int* rois_batch_id_data = - rois_batch_id_list.mutable_data(ctx.GetPlace()); - int rois_batch_size; - if (ctx.HasInput("RoisNum")) { - auto* rois_num_t = ctx.Input("RoisNum"); - rois_batch_size = rois_num_t->numel(); - auto* rois_num_data = rois_num_t->data(); - int start = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (int i = start; i < start + rois_num_data[n]; ++i) { - rois_batch_id_data[i] = n; - } - start += rois_num_data[n]; - } - } else { - auto rois_lod = rois->lod().back(); - rois_batch_size = rois_lod.size() - 1; - // calculate batch id index for each roi according to LoD - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - rois_batch_id_data[i] = n; - } - } - } - const T* input_rois = rois->data(); - const T* output_grad_data = output_grad->data(); - T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); - - // set gradient of X to be 0. before backpropagate. 
- phi::funcs::SetConstant set_zero; - set_zero(ctx.template device_context(), input_grad, - static_cast(0)); - - // backpropagate gradient per output pixel - int output_grad_size = output_grad->numel(); - for (int i = 0; i < output_grad_size; ++i) { - // The output is in order (n, c, ph, pw) - int pw = i % pooled_width; - int ph = (i / pooled_width) % pooled_height; - int c = (i / pooled_width / pooled_height) % output_channels; - int n = i / pooled_width / pooled_height / output_channels; - - // set roi_batch_id - int roi_batch_id = rois_batch_id_data[n]; - int input_channel = (c * pooled_height + ph) * pooled_width + pw; - int input_offset = - (roi_batch_id * input_channels + input_channel) * height * width; - T* offset_input_grad_data = input_grad_data + input_offset; - - // [start, end) interval for spatial sampling - const T* offset_input_rois = input_rois + n * 4; - T roi_start_w = - static_cast(round(offset_input_rois[0])) * spatial_scale; - T roi_start_h = - static_cast(round(offset_input_rois[1])) * spatial_scale; - T roi_end_w = - static_cast(round(offset_input_rois[2]) + 1.) * spatial_scale; - T roi_end_h = - static_cast(round(offset_input_rois[3]) + 1.) * spatial_scale; - - // Force too small ROIs to be 1x1 - T roi_height = std::max(roi_end_h - roi_start_h, (T)0.1); // avoid 0 - T roi_width = std::max(roi_end_w - roi_start_w, (T)0.1); - - // Compute w and h at input feature map - T bin_size_h = roi_height / static_cast(pooled_height); - T bin_size_w = roi_width / static_cast(pooled_width); - - int hstart = floor(bin_size_h * static_cast(ph) + roi_start_h); - int wstart = floor(bin_size_w * static_cast(pw) + roi_start_w); - int hend = ceil(bin_size_h * static_cast(ph + 1) + roi_start_h); - int wend = ceil(bin_size_w * static_cast(pw + 1) + roi_start_w); - - // Add roi offsets and clip to input boundaries - hstart = std::min(std::max(hstart, 0), height); - hend = std::min(std::max(hend, 0), height); - wstart = std::min(std::max(wstart, 0), width); - wend = std::min(std::max(wend, 0), width); - bool is_empty = (hend <= hstart) || (wend <= wstart); - - // Accumulate diff_val into input data - T bin_area = static_cast((hend - hstart) * (wend - wstart)); - T diff_val = is_empty ? 0. : output_grad_data[i] / bin_area; - for (int ih = hstart; ih < hend; ++ih) { - for (int iw = wstart; iw < wend; ++iw) { - int input_index = ih * width + iw; - offset_input_grad_data[input_index] += diff_val; - } - } - } - } - return; - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/put_along_axis_op.cc b/paddle/fluid/operators/put_along_axis_op.cc index 6b0d6f332bcae8890cdfaccb1244886daa63ae42..54e31845ad4bd5ddfa81bc90a10391f027dffc11 100644 --- a/paddle/fluid/operators/put_along_axis_op.cc +++ b/paddle/fluid/operators/put_along_axis_op.cc @@ -12,10 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/put_along_axis_op.h" #include #include #include + +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/phi/core/ddim.h" @@ -123,16 +124,3 @@ REGISTER_OPERATOR(put_along_axis, ops::PutAlongAxisOp, ops::PutAlongAxisOpMaker, paddle::operators::PutAlongAxisInplaceInferer); REGISTER_OPERATOR(put_along_axis_grad, ops::PutAlongAxisGradOp); - -REGISTER_OP_CPU_KERNEL(put_along_axis, ops::PutAlongAxisOpKernel, - ops::PutAlongAxisOpKernel, - ops::PutAlongAxisOpKernel, - ops::PutAlongAxisOpKernel, - ops::PutAlongAxisOpKernel); - -REGISTER_OP_CPU_KERNEL(put_along_axis_grad, - ops::PutAlongAxisGradOpKernel, - ops::PutAlongAxisGradOpKernel, - ops::PutAlongAxisGradOpKernel, - ops::PutAlongAxisGradOpKernel, - ops::PutAlongAxisGradOpKernel); diff --git a/paddle/fluid/operators/put_along_axis_op.cu b/paddle/fluid/operators/put_along_axis_op.cu deleted file mode 100644 index 5508023efad2c60a00f5ea3a8d1b853c6e5ba1fb..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/put_along_axis_op.cu +++ /dev/null @@ -1,134 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include -#include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/fluid/operators/put_along_axis_op.h" -#include "paddle/phi/core/ddim.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -template -class PutAlongAxisCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet( - "PutAlongAxisCUDAKernel only runs on GPU device.")); - auto input = ctx.Input("Input"); - auto axis = ctx.Attr("Axis"); - auto value = ctx.Input("Value"); - auto index = ctx.Input("Index"); - auto reduce_op = ctx.Attr("Reduce"); - auto result = ctx.Output("Result"); - const platform::DeviceContext &device_ctx = ctx.device_context(); - - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - - framework::TensorCopy(*input, ctx.GetPlace(), result); - if (reduce_op == "add") { - if (index_type == framework::proto::VarType::INT32) { - gpu_scatter_add_kernel(*result, axis, *index, *value, - device_ctx); - } else if (index_type == framework::proto::VarType::INT64) { - gpu_scatter_add_kernel(*result, axis, *index, *value, - device_ctx); - } - } else if (reduce_op == "multiply" || reduce_op == "mul") { - if (index_type == framework::proto::VarType::INT32) { - gpu_scatter_mul_kernel(*result, axis, *index, *value, - device_ctx); - } else if (index_type == framework::proto::VarType::INT64) { - gpu_scatter_mul_kernel(*result, axis, *index, *value, - device_ctx); - } - } else if (reduce_op == "assign") { - if (index_type == framework::proto::VarType::INT32) { - gpu_scatter_assign_kernel(*result, axis, *index, *value, - device_ctx); - } else if 
(index_type == framework::proto::VarType::INT64) { - gpu_scatter_assign_kernel(*result, axis, *index, *value, - device_ctx); - } - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "can not support reduce_op: '%s' for scatter kernel, only " - "support reduce op: 'add‘, 'assign', 'mul' and 'multiply', the " - "defalut reduce op is 'assign' ", - reduce_op)); - return; - } - } -}; - -template -class PutAlongAxisGradOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet( - "PutAlongAxisGradOpCUDAKernel only runs on GPU.")); - - auto input_grad = ctx.Output(framework::GradVarName("Input")); - auto value_grad = ctx.Output(framework::GradVarName("Value")); - auto index = ctx.Input("Index"); - auto result_grad = ctx.Input(framework::GradVarName("Result")); - auto axis = ctx.Attr("Axis"); - - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - if (input_grad) { - framework::TensorCopy(*result_grad, ctx.GetPlace(), input_grad); - if (index_type == framework::proto::VarType::INT32) { - gpu_scatter_input_grad_kernel( - *result_grad, axis, *index, *input_grad, ctx.device_context()); - } else { - gpu_scatter_input_grad_kernel( - *result_grad, axis, *index, *input_grad, ctx.device_context()); - } - } - if (value_grad) { - value_grad->Resize(index->dims()); - value_grad->mutable_data(ctx.GetPlace()); - if (index_type == framework::proto::VarType::INT32) { - gpu_gather_kernel( - *result_grad, axis, *index, *value_grad, - ctx.device_context()); // the gradient of scatter is gather - } else if (index_type == framework::proto::VarType::INT64) { - gpu_gather_kernel(*result_grad, axis, *index, *value_grad, - ctx.device_context()); - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL(put_along_axis, ops::PutAlongAxisCUDAKernel, - ops::PutAlongAxisCUDAKernel, - ops::PutAlongAxisCUDAKernel, - ops::PutAlongAxisCUDAKernel, - ops::PutAlongAxisCUDAKernel); -REGISTER_OP_CUDA_KERNEL(put_along_axis_grad, - ops::PutAlongAxisGradOpCUDAKernel, - ops::PutAlongAxisGradOpCUDAKernel, - ops::PutAlongAxisGradOpCUDAKernel, - ops::PutAlongAxisGradOpCUDAKernel, - ops::PutAlongAxisGradOpCUDAKernel); diff --git a/paddle/fluid/operators/put_along_axis_op.h b/paddle/fluid/operators/put_along_axis_op.h deleted file mode 100644 index 38487f5ce28c9e35dd6e84403b88dbc0fdfa07b3..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/put_along_axis_op.h +++ /dev/null @@ -1,124 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
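The deleted put_along_axis CUDA kernels above dispatch twice: first on the Reduce attribute ("add", "multiply"/"mul", "assign"), then on the index dtype (INT32 vs INT64); the backward pass reuses gather because, as the removed comment notes, the gradient of scatter is gather. A hedged sketch of that dispatch shape in plain C++; the Scatter* stand-ins below are hypothetical placeholders for the gpu_scatter_*_kernel calls, not Paddle functions:

    #include <cstdint>
    #include <stdexcept>
    #include <string>

    // Hypothetical stand-ins for the gpu_scatter_add/mul/assign kernels.
    template <typename T, typename IndexT> void ScatterAdd() {}
    template <typename T, typename IndexT> void ScatterMul() {}
    template <typename T, typename IndexT> void ScatterAssign() {}

    // Dispatch shape used by the removed kernels: reduce mode first, then
    // index dtype (int32_t vs int64_t).
    template <typename T>
    void PutAlongAxisDispatch(const std::string& reduce_op, bool index_is_int32) {
      if (reduce_op == "add") {
        if (index_is_int32) ScatterAdd<T, int32_t>(); else ScatterAdd<T, int64_t>();
      } else if (reduce_op == "multiply" || reduce_op == "mul") {
        if (index_is_int32) ScatterMul<T, int32_t>(); else ScatterMul<T, int64_t>();
      } else if (reduce_op == "assign") {
        if (index_is_int32) ScatterAssign<T, int32_t>(); else ScatterAssign<T, int64_t>();
      } else {
        throw std::invalid_argument("unsupported reduce_op: " + reduce_op);
      }
    }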
*/ - -#pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/gather_scatter_kernel.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class PutAlongAxisOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet( - "PutAlongAxisOpKernel only runs on CPU.")); - - auto input = ctx.Input("Input"); - auto axis = ctx.Attr("Axis"); - auto value = ctx.Input("Value"); - auto index = ctx.Input("Index"); - auto reduce_op = ctx.Attr("Reduce"); - auto result = ctx.Output("Result"); - - framework::TensorCopy(*input, ctx.GetPlace(), result); - const platform::DeviceContext &device_ctx = ctx.device_context(); - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - if (reduce_op == "add") { - if (index_type == framework::proto::VarType::INT32) { - cpu_scatter_add_kernel(*result, axis, *index, *value, - device_ctx); - } else if (index_type == framework::proto::VarType::INT64) { - cpu_scatter_add_kernel(*result, axis, *index, *value, - device_ctx); - } - } else if (reduce_op == "multiply" || reduce_op == "mul") { - if (index_type == framework::proto::VarType::INT32) { - cpu_scatter_mul_kernel(*result, axis, *index, *value, - device_ctx); - } else if (index_type == framework::proto::VarType::INT64) { - cpu_scatter_mul_kernel(*result, axis, *index, *value, - device_ctx); - } - } else if (reduce_op == "assign") { - if (index_type == framework::proto::VarType::INT32) { - cpu_scatter_assign_kernel(*result, axis, *index, *value, - device_ctx); - } else if (index_type == framework::proto::VarType::INT64) { - cpu_scatter_assign_kernel(*result, axis, *index, *value, - device_ctx); - } - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "can not support reduce_op: '%s' for scatter kernel, only " - "support reduce op: 'add‘, 'assign', 'mul' and 'multiply', the " - "defalut reduce " - "op is 'assign' ", - reduce_op)); - return; - } - } -}; - -template -class PutAlongAxisGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet( - "PutAlongAxisGradOpKernel only runs on CPU.")); - - auto input_grad = ctx.Output(framework::GradVarName("Input")); - auto value_grad = ctx.Output(framework::GradVarName("Value")); - auto index = ctx.Input("Index"); - auto result_grad = ctx.Input(framework::GradVarName("Result")); - auto axis = ctx.Attr("Axis"); - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - - if (input_grad) { - framework::TensorCopy(*result_grad, ctx.GetPlace(), input_grad); - if (index_type == framework::proto::VarType::INT32) { - cpu_scatter_input_grad_kernel( - // Here passing an unused argument *result_grad, because it's - // convenient to instantiate a bunch of template function with the - // same arguments list. 
- *result_grad, axis, *index, *input_grad, ctx.device_context()); - } else { - cpu_scatter_input_grad_kernel( - *result_grad, axis, *index, *input_grad, ctx.device_context()); - } - } - - if (value_grad) { - value_grad->Resize(index->dims()); - value_grad->mutable_data(ctx.GetPlace()); - if (index_type == framework::proto::VarType::INT32) { - cpu_gather_kernel(*result_grad, axis, *index, *value_grad, - ctx.device_context()); - } else if (index_type == framework::proto::VarType::INT64) { - cpu_gather_kernel(*result_grad, axis, *index, *value_grad, - ctx.device_context()); - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/qr_op.cu b/paddle/fluid/operators/qr_op.cu index 5e841a097fed76d0ee5582c40ce417e24fb4a739..a57a8d5cf8b7f65a892ce9465ce03bd3c9519f1c 100644 --- a/paddle/fluid/operators/qr_op.cu +++ b/paddle/fluid/operators/qr_op.cu @@ -56,13 +56,13 @@ class QrGPUKernel : public framework::OpKernel { int tau_stride = min_mn; if (compute_q) { - q.mutable_data>( + q.mutable_data>( context.GetPlace(), - size_t(batch_size * m * k * sizeof(phi::funcs::Real))); + size_t(batch_size * m * k * sizeof(phi::dtype::Real))); } - r.mutable_data>( + r.mutable_data>( context.GetPlace(), - size_t(batch_size * k * n * sizeof(phi::funcs::Real))); + size_t(batch_size * k * n * sizeof(phi::dtype::Real))); auto dito = math::DeviceIndependenceTensorOperations { // Note: allocate temporary tensors because of lacking in-place operatios. // Prepare qr Tensor qr; - qr.mutable_data>( + qr.mutable_data>( context.GetPlace(), - size_t(batch_size * m * n * sizeof(phi::funcs::Real))); + size_t(batch_size * m * n * sizeof(phi::dtype::Real))); // BatchedGeqrf performs computation in-place and 'qr' must be a copy of // input paddle::framework::TensorCopy(x, context.GetPlace(), &qr); @@ -126,7 +126,7 @@ class QrGPUKernel : public framework::OpKernel { for (int i = 0; i < batch_size; ++i) { memory::Copy(dev_ctx.GetPlace(), (new_qr_data + i * new_qr_stride), dev_ctx.GetPlace(), (qr_data + i * qr_stride), - qr_stride * sizeof(phi::funcs::Real), + qr_stride * sizeof(phi::dtype::Real), dev_ctx.stream()); } BatchedOrgqr( diff --git a/paddle/fluid/operators/qr_op.h b/paddle/fluid/operators/qr_op.h index cef9371fea099627fd4280f166f013bc84507372..f09a07e96cd34e1b631ef9484fe23b12a3b58543 100644 --- a/paddle/fluid/operators/qr_op.h +++ b/paddle/fluid/operators/qr_op.h @@ -74,19 +74,19 @@ class QrCPUKernel : public framework::OpKernel { int q_stride = m * k; int r_stride = k * n; - auto* x_data = x.data>(); + auto* x_data = x.data>(); T* q_data = nullptr; if (compute_q) { - q_data = q.mutable_data>( + q_data = q.mutable_data>( context.GetPlace(), - size_t(batch_size * m * k * sizeof(phi::funcs::Real))); + size_t(batch_size * m * k * sizeof(phi::dtype::Real))); memset(q_data, 0, - size_t(batch_size * m * k * sizeof(phi::funcs::Real))); + size_t(batch_size * m * k * sizeof(phi::dtype::Real))); } - auto* r_data = r.mutable_data>( + auto* r_data = r.mutable_data>( context.GetPlace(), - size_t(batch_size * k * n * sizeof(phi::funcs::Real))); - memset(r_data, 0, size_t(batch_size * k * n * sizeof(phi::funcs::Real))); + size_t(batch_size * k * n * sizeof(phi::dtype::Real))); + memset(r_data, 0, size_t(batch_size * k * n * sizeof(phi::dtype::Real))); // Implement QR by calling Eigen for (int i = 0; i < batch_size; ++i) { @@ -142,7 +142,7 @@ class QrGradKernel : public framework::OpKernel { // Use a different name dA instead of dX framework::Tensor& dA = *ctx.Output(framework::GradVarName("X")); 
- dA.mutable_data>(ctx.GetPlace()); + dA.mutable_data>(ctx.GetPlace()); auto& dev_ctx = ctx.template device_context(); phi::funcs::SetConstant()(dev_ctx, &dA, T(0)); @@ -224,7 +224,7 @@ class QrGradKernel : public framework::OpKernel { } else { // If m < n for input matrices A, we partition A = [X|Y] and R = [U|V] // Calculate dX and dY individually and concatenate them to get dA - dA.mutable_data>(ctx.GetPlace()); + dA.mutable_data>(ctx.GetPlace()); auto Y = dito.Slice(A, {-1}, {m}, {n}); auto U = dito.Slice(R, {-1}, {0}, {m}); diff --git a/paddle/fluid/operators/range_op_npu_test.cc b/paddle/fluid/operators/range_op_npu_test.cc index 24741efe426b18b7cecae9332c522d67aee98d63..c7e91ba35dee1356ddd71ade0fe9892f8032c77b 100644 --- a/paddle/fluid/operators/range_op_npu_test.cc +++ b/paddle/fluid/operators/range_op_npu_test.cc @@ -24,7 +24,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc index 21c23a7f602a35acf676e97a9134c2c43a73126c..4b6759ea165edf29add66ee44461fdd4d9f84d00 100644 --- a/paddle/fluid/operators/reader/buffered_reader.cc +++ b/paddle/fluid/operators/reader/buffered_reader.cc @@ -70,9 +70,25 @@ BufferedReader::BufferedReader( stream_ = platform::NpuStreamResourcePool::Instance().New(dev_idx); } #endif + +#ifdef PADDLE_WITH_MLU + if (platform::is_mlu_place(place_)) { + int dev_idx = place_.device; + compute_stream_ = + ((platform::MLUDeviceContext *)(platform::DeviceContextPool::Instance() + .Get(place_))) + ->stream(); + events_.resize(buffer_size); + for (auto &event : events_) { + event = platform::MluEventResourcePool::Instance().New(dev_idx); + } + stream_ = platform::MluStreamResourcePool::Instance().New(dev_idx); + } +#endif cpu_buffer_.resize(buffer_size); cuda_buffer_.resize(buffer_size); npu_buffer_.resize(buffer_size); + mlu_buffer_.resize(buffer_size); ReadTillBufferFullAsync(); } @@ -256,6 +272,56 @@ void BufferedReader::ReadAsync(size_t i) { platform::NPUStreamSync(stream_.get()); } #endif + +#ifdef PADDLE_WITH_MLU + if (platform::is_mlu_place(place_)) { + TensorVec &mlu = mlu_buffer_[i]; + if (mlu.empty()) { + mlu.resize(cpu.size()); + } else { + PADDLE_ENFORCE_EQ( + mlu.size(), cpu.size(), + platform::errors::InvalidArgument( + "Input tensor number on MLU and CPU devices are not matched. 
" + "The number on MLU is %d, on CPU is %d", + mlu.size(), cpu.size())); + } + + std::vector mlu_ptrs; + mlu_ptrs.reserve(cpu.size()); + for (size_t i = 0; i < cpu.size(); ++i) { + mlu[i].Resize(cpu[i].dims()); + mlu[i].set_layout(cpu[i].layout()); + mlu_ptrs.emplace_back(mlu[i].mutable_data(place_, cpu[i].type())); + } + + platform::SetMLUDeviceId(place_.device); + PADDLE_ENFORCE_MLU_SUCCESS( + cnPlaceNotifier(events_[i].get(), compute_stream_)); + PADDLE_ENFORCE_MLU_SUCCESS(cnWaitNotifier(events_[i].get())); + + platform::RecordEvent record_event("BufferedReader:MemoryCopy", + platform::TracerEventType::UserDefined, + 1); + for (size_t i = 0; i < cpu.size(); ++i) { + auto cpu_place = cpu[i].place(); + auto cpu_ptr = cpu[i].data(); + auto mlu_ptr = mlu_ptrs[i]; + auto size = + cpu[i].numel() * paddle::framework::DataTypeSize(cpu[i].dtype()); + if ((platform::is_mlu_place(cpu_place))) { + memory::Copy(place_, mlu_ptr, cpu_place, cpu_ptr, size, + stream_.get()); + } else { + memory::Copy(place_, mlu_ptr, cpu_place, cpu_ptr, size, + stream_.get()); + platform::MLUStreamSync(stream_.get()); + } + mlu[i].set_lod(cpu[i].lod()); + } + platform::MLUStreamSync(stream_.get()); + } +#endif return i; })); } @@ -291,6 +357,8 @@ void BufferedReader::ReadNextImpl(std::vector *out) { *out = std::move(cuda_buffer_[i]); } else if (platform::is_npu_place(place_)) { *out = std::move(npu_buffer_[i]); + } else if (platform::is_mlu_place(place_)) { + *out = std::move(mlu_buffer_[i]); } else { *out = std::move(cpu_buffer_[i]); } diff --git a/paddle/fluid/operators/reader/buffered_reader.h b/paddle/fluid/operators/reader/buffered_reader.h index 3d42486c6df8815aaab8e55e29898700bb74d953..f0f3b6b7f9fdfeb69c46e7122fae5c6cfbf3a169 100644 --- a/paddle/fluid/operators/reader/buffered_reader.h +++ b/paddle/fluid/operators/reader/buffered_reader.h @@ -29,6 +29,11 @@ #include "paddle/fluid/platform/device/npu/npu_info.h" #include "paddle/fluid/platform/device/npu/npu_resource_pool.h" #endif +#ifdef PADDLE_WITH_MLU +#include "paddle/fluid/platform/device/mlu/mlu_info.h" +#include "paddle/fluid/platform/device/mlu/mlu_resource_pool.h" +#endif + namespace paddle { namespace operators { namespace reader { @@ -70,6 +75,7 @@ class BufferedReader : public framework::DecoratedReader { std::vector cpu_buffer_; std::vector cuda_buffer_; std::vector npu_buffer_; + std::vector mlu_buffer_; size_t prev_pos_{-1UL}; #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) gpuStream_t compute_stream_; @@ -82,6 +88,12 @@ class BufferedReader : public framework::DecoratedReader { std::shared_ptr stream_; std::vector> events_; #endif + +#ifdef PADDLE_WITH_MLU + mluStream compute_stream_; + std::shared_ptr stream_; + std::vector> events_; +#endif }; } // namespace reader diff --git a/paddle/fluid/operators/real_op.cc b/paddle/fluid/operators/real_op.cc index 1f3691978b577e2023eb4f784f2327752855b9b7..18e444702fbb2cc19912a32587f96330e6e8632d 100644 --- a/paddle/fluid/operators/real_op.cc +++ b/paddle/fluid/operators/real_op.cc @@ -82,8 +82,8 @@ DECLARE_INPLACE_OP_INFERER(RealGradOpInplaceInferer, } // namespace operators } // namespace paddle -DELCARE_INFER_SHAPE_FUNCTOR(real, RealInferShapeFunctor, - PT_INFER_META(phi::UnchangedInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(real, RealInferShapeFunctor, + PD_INFER_META(phi::RealAndImagInferMeta)); namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/reduce_ops/check_reduce_rank_test.cu b/paddle/fluid/operators/reduce_ops/check_reduce_rank_test.cu index 
e8e4ff7010d3df01cda514d51796b789ef5e1da6..a724524716be39e554c6046ca809624b7fbb053a 100644 --- a/paddle/fluid/operators/reduce_ops/check_reduce_rank_test.cu +++ b/paddle/fluid/operators/reduce_ops/check_reduce_rank_test.cu @@ -39,9 +39,9 @@ TEST(test_reduce_rank_check, all) { } if (is_valid) { - phi::kernels::details::CheckReduceRank(reduce_rank, rank); + phi::funcs::details::CheckReduceRank(reduce_rank, rank); } else { - ASSERT_THROW(phi::kernels::details::CheckReduceRank(reduce_rank, rank), + ASSERT_THROW(phi::funcs::details::CheckReduceRank(reduce_rank, rank), paddle::platform::EnforceNotMet); } } diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op.cc b/paddle/fluid/operators/reduce_ops/reduce_max_op.cc index cb438b4a8057267015c8b3c15dd8468fca5a4b44..41df8e4a15f093a40a31c70eea98dfb7e575f4cd 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_max_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_max_op.cc @@ -14,15 +14,28 @@ #include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h" -REGISTER_REDUCE_OP(reduce_max); -REGISTER_OP_CPU_KERNEL( - reduce_max, ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel); +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + +namespace ops = paddle::operators; + +class ReduceMaxOpMaker : public ops::ReduceOpMaker { + protected: + virtual std::string GetName() const { return "reduce_max"; } + virtual std::string GetOpType() const { return "Reduce reduce_max"; } +}; + +DECLARE_INFER_SHAPE_FUNCTOR(reduce_max, ReduceMaxInferShapeFunctor, + PD_INFER_META(phi::ReduceInferMetaBase)); + +REGISTER_OPERATOR( + reduce_max, ops::ReduceOp, ReduceMaxOpMaker, + paddle::framework::DefaultGradOpMaker, + paddle::framework::DefaultGradOpMaker, + ReduceMaxInferShapeFunctor); +REGISTER_OPERATOR(reduce_max_grad, ops::ReduceGradOp) + REGISTER_OP_CPU_KERNEL( reduce_max_grad, ops::ReduceGradKernel, diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op.cu b/paddle/fluid/operators/reduce_ops/reduce_max_op.cu deleted file mode 100644 index 8194805ddc3736b365667883447cc13d7b729494..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/reduce_ops/reduce_max_op.cu +++ /dev/null @@ -1,23 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
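In the reduce_max change above, the fluid CPU kernel registrations are dropped and shape inference is bound through DECLARE_INFER_SHAPE_FUNCTOR(reduce_max, ..., PD_INFER_META(phi::ReduceInferMetaBase)), so the shape rule is written once in phi and each operator only registers a binding. A toy standalone analogue of that idea (not the Paddle API; the registry, function names, and the simplified keep_dim rule are invented for illustration):

    #include <algorithm>
    #include <cstdint>
    #include <functional>
    #include <iostream>
    #include <map>
    #include <string>
    #include <vector>

    // One shared shape rule, registered for several reduce ops, instead of a
    // hand-written InferShape per operator.
    using InferShapeFn = std::function<std::vector<int64_t>(
        const std::vector<int64_t>&, const std::vector<int64_t>&, bool)>;

    std::vector<int64_t> ReduceInferShape(const std::vector<int64_t>& x_dims,
                                          const std::vector<int64_t>& axes,
                                          bool keep_dim) {
      std::vector<int64_t> out;
      for (int64_t i = 0; i < static_cast<int64_t>(x_dims.size()); ++i) {
        const bool reduced = std::find(axes.begin(), axes.end(), i) != axes.end();
        if (!reduced) {
          out.push_back(x_dims[i]);
        } else if (keep_dim) {
          out.push_back(1);
        }
      }
      return out;
    }

    int main() {
      std::map<std::string, InferShapeFn> registry;
      // reduce_max, reduce_mean and reduce_sum can all bind the same rule.
      registry["reduce_max"] = ReduceInferShape;
      registry["reduce_mean"] = ReduceInferShape;
      const auto out = registry["reduce_max"]({4, 8, 16}, {1}, /*keep_dim=*/true);
      for (auto d : out) std::cout << d << " ";  // prints: 4 1 16
      std::cout << "\n";
      return 0;
    }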
-#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" -#include "paddle/fluid/operators/reduce_ops/reduce_op.h" - -// reduce_max -REGISTER_OP_CUDA_KERNEL( - reduce_max, - ops::ReduceCudaKernel, - ops::ReduceCudaKernel, - ops::ReduceCudaKernel, - ops::ReduceCudaKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op_mlu.cc b/paddle/fluid/operators/reduce_ops/reduce_max_op_mlu.cc index 7e02f0268b5e510ac8262543db58ee98ef20e517..1abec24c0d3ef9dc42739b90f775566a8737b852 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_max_op_mlu.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_max_op_mlu.cc @@ -27,11 +27,11 @@ class ReduceMaxMLUKernel : public framework::OpKernel { int out_dtype = context.Attr("out_dtype"); bool reduce_all = context.Attr("reduce_all"); auto dims = context.Attr>("dim"); - auto input_dims = framework::vectorize(input->dims()); + auto input_dims = input->dims(); const auto& input_dim_size = input->dims().size(); std::vector reduce_dims; if (reduce_all) { - for (size_t i = 0; i < input_dims.size(); i++) { + for (int i = 0; i < input_dims.size(); i++) { reduce_dims.push_back(static_cast(i)); } } else { diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc index e80df5f95bb4ab33a6c08cc646d0ef8311e38936..4a18330913803f822436118a35fb957b7e31b391 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc @@ -18,6 +18,10 @@ #include #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + namespace paddle { namespace operators { @@ -92,9 +96,13 @@ class __reduce_meanMaker__ : public ops::ReduceOpMaker { virtual std::string GetOpType() const { return "Reduce reduce_mean"; } }; +DECLARE_INFER_SHAPE_FUNCTOR(reduce_mean, ReduceMeanInferShapeFunctor, + PD_INFER_META(phi::ReduceInferMetaBase)); + REGISTER_OPERATOR(reduce_mean, ops::ReduceOp, __reduce_meanMaker__, ops::ReduceMeanOpGradMaker, - ops::ReduceMeanOpGradMaker); + ops::ReduceMeanOpGradMaker, + ReduceMeanInferShapeFunctor); REGISTER_OPERATOR(reduce_mean_grad, ops::ReduceGradOp, ops::ReduceMeanDoubleGradDescMaker, ops::ReduceMeanDoubleGradOpBaseMaker, diff --git a/paddle/fluid/operators/reduce_ops/reduce_min_op_mlu.cc b/paddle/fluid/operators/reduce_ops/reduce_min_op_mlu.cc index daf5965fd54628a097ad1d53057ec54b9a5d329a..d80cce742210f1fb7ca6cda977e9f5b455f1a84b 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_min_op_mlu.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_min_op_mlu.cc @@ -27,11 +27,11 @@ class ReduceMinMLUKernel : public framework::OpKernel { int out_dtype = context.Attr("out_dtype"); bool reduce_all = context.Attr("reduce_all"); auto dims = context.Attr>("dim"); - auto input_dims = framework::vectorize(input->dims()); + auto input_dims = input->dims(); const auto& input_dim_size = input->dims().size(); std::vector reduce_dims; if (reduce_all) { - for (size_t i = 0; i < input_dims.size(); i++) { + for (int i = 0; i < input_dims.size(); i++) { reduce_dims.push_back(static_cast(i)); } } else { diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h index 3aab906804f7adb95f80aa2675f01217b0b48d39..160617695338a9f2e140b7b418c93ef0d7c57e17 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h @@ -23,8 +23,7 @@ #include "paddle/fluid/framework/tensor.h" 
#include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/kernels/gpu/reduce.h" - +#include "paddle/phi/kernels/funcs/reduce_function.h" namespace paddle { namespace operators { @@ -37,9 +36,9 @@ void TensorReduceImpl(const platform::CUDADeviceContext& dev_ctx, gpuStream_t stream) { y->mutable_data(x.place()); - phi::kernels::TensorReduceImpl( + phi::funcs::ReduceKernel( static_cast(dev_ctx), x, y, transform, - origin_reduce_dims, stream); + origin_reduce_dims); } } // namespace operators diff --git a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc index 50df75d9ad3fd78ece196e5b7cc76eafe42e1d2d..eb745ab9c56c5b3cfa62eb36713ebc2485282d6d 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc @@ -27,15 +27,7 @@ class CPUDeviceContext; } // namespace paddle REGISTER_REDUCE_OP(reduce_prod); -REGISTER_OP_CPU_KERNEL(reduce_prod, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel); + REGISTER_OP_CPU_KERNEL(reduce_prod_grad, ops::ReduceGradKernel, diff --git a/paddle/fluid/operators/reduce_ops/reduce_prod_op.h b/paddle/fluid/operators/reduce_ops/reduce_prod_op.h index 103e108e4bda1c33434ec0c5d6c58f24fa725f57..60dedf8d6ffb0706f8ec9ac2130b6b51067df918 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_prod_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_prod_op.h @@ -19,13 +19,6 @@ namespace paddle { namespace operators { -struct ProdFunctor { - template - void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { - y->device(place) = x->prod(dim); - } -}; - struct ProdGradFunctor { template diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc index bdab14a18a05ab3e0df1dbda57f3753033cfacb4..2a78774f3706e73bd8931e80fe020faac58d7ff5 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc @@ -16,6 +16,10 @@ #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + namespace paddle { namespace framework { class OpDesc; @@ -98,24 +102,15 @@ class ReduceSumOpMaker : public ops::ReduceOpMaker { virtual std::string GetOpType() const { return "Reduce reduce_sum"; } }; +DECLARE_INFER_SHAPE_FUNCTOR(reduce_sum, ReduceSumInferShapeFunctor, + PD_INFER_META(phi::SumRawInferMeta)); + REGISTER_OPERATOR(reduce_sum, ops::ReduceOp, ReduceSumOpMaker, ops::ReduceSumVarTypeInference, ops::ReduceSumOpGradMaker, - ops::ReduceSumOpGradMaker); + ops::ReduceSumOpGradMaker, + ReduceSumInferShapeFunctor); REGISTER_OPERATOR(reduce_sum_grad, ops::ReduceGradOp, ops::ReduceSumDoubleOpGradMaker, ops::ReduceSumDoubleOpGradMaker, ops::ReduceSumGradNoNeedBufferVarInferer); - -template -using CPUReduceSumGradKernel = - ops::ReduceSumGradKernel; - -REGISTER_OP_CPU_KERNEL( - reduce_sum_grad, CPUReduceSumGradKernel, - CPUReduceSumGradKernel, CPUReduceSumGradKernel, - CPUReduceSumGradKernel, - CPUReduceSumGradKernel, CPUReduceSumGradKernel, - CPUReduceSumGradKernel>, - CPUReduceSumGradKernel>); diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu b/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu deleted file mode 100644 index c3d3e0cf6ecd51f3bb2baa063878f80444db3563..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright (c) 
2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" -#include "paddle/fluid/operators/reduce_ops/reduce_sum_op.h" - -template -using CUDAReduceSumGradKernel = - ops::ReduceCudaGradKernel; - -REGISTER_OP_CUDA_KERNEL( - reduce_sum_grad, CUDAReduceSumGradKernel, - CUDAReduceSumGradKernel, CUDAReduceSumGradKernel, - CUDAReduceSumGradKernel, - CUDAReduceSumGradKernel, CUDAReduceSumGradKernel, - CUDAReduceSumGradKernel>, - CUDAReduceSumGradKernel>); diff --git a/paddle/fluid/operators/rnn_op.h b/paddle/fluid/operators/rnn_op.h index c18570af775cc88d7c54de7899d7359f791b8b08..a473b54c1f855945a5f3f0ac8d0826b15494ba1a 100644 --- a/paddle/fluid/operators/rnn_op.h +++ b/paddle/fluid/operators/rnn_op.h @@ -16,17 +16,17 @@ limitations under the License. */ #include #include +#include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/activation_op.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/operators/math/concat_and_split.h" -#include "paddle/fluid/operators/math/detail/activation_functions.h" #include "paddle/fluid/operators/math/fc.h" -#include "paddle/fluid/operators/math/gru_compute.h" -#include "paddle/fluid/operators/math/lstm_compute.h" #include "paddle/fluid/operators/unique_op.h" #include "paddle/fluid/operators/utils.h" #include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/detail/activation_functions.h" +#include "paddle/phi/kernels/funcs/gru_compute.h" +#include "paddle/phi/kernels/funcs/lstm_compute.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { @@ -36,6 +36,14 @@ using LoDTensor = framework::LoDTensor; using Tensor = framework::Tensor; using TensorList = std::vector; +template +using EigenMatrix = framework::EigenMatrix; + +template +using EigenVector = framework::EigenVector; + #define DEFINE_MODE_DETECTOR(MODE_NAME, MODE_STR) \ inline bool is_##MODE_NAME(const framework::ExecutionContext& ctx) { \ const std::string& mode = ctx.Attr("mode"); \ @@ -100,7 +108,7 @@ struct Cell { }; template class EigenActivationFunctor, - math::detail::ActivationType act_type> + phi::funcs::detail::ActivationType act_type> struct SimpleRNNCell : Cell { void operator()(const platform::CPUDeviceContext* device_ctx, Tensor* input, const Tensor* weight_hh, const Tensor* init_h, @@ -148,7 +156,7 @@ struct GRUCell : Cell { size_t frame_size = init_h->dims()[2]; size_t batch_size = init_h->dims()[1]; - math::GRUMetaValue gru_value; + phi::funcs::GRUMetaValue gru_value; gru_value.gate_weight = weight_hh->data(); gru_value.state_weight = weight_hh->data() + 2 * frame_size * frame_size; gru_value.reset_bias = bias_hh->data() + 2 * frame_size; @@ -158,10 +166,10 @@ struct GRUCell : Cell { gru_value.output_value = output->data(); gru_value.prev_out_value = init_h->data(); - auto gate_act = math::detail::GetActivationType("sigmoid_v2"); - auto cand_act = 
math::detail::GetActivationType("tanh_v2"); + auto gate_act = phi::funcs::detail::GetActivationType("sigmoid_v2"); + auto cand_act = phi::funcs::detail::GetActivationType("tanh_v2"); - math::GRUUnitFunctorV2::compute( + phi::funcs::GRUUnitFunctorV2::compute( *device_ctx, gru_value, frame_size, batch_size, cand_act, gate_act); } }; @@ -184,14 +192,14 @@ struct LSTMCell : Cell { blas.MatMul(*init_h, mat_dim_a, *weight_hh, mat_dim_b, static_cast(1.0), input, static_cast(1.0)); - math::LstmMetaValue lstm_value; + phi::funcs::LstmMetaValue lstm_value; lstm_value.check_ig = nullptr; lstm_value.check_fg = nullptr; lstm_value.check_og = nullptr; - auto gate_act = math::detail::GetActivationType("sigmoid_v2"); - auto cell_act = math::detail::GetActivationType("tanh_v2"); - auto cand_act = math::detail::GetActivationType("tanh_v2"); + auto gate_act = phi::funcs::detail::GetActivationType("sigmoid_v2"); + auto cell_act = phi::funcs::detail::GetActivationType("tanh_v2"); + auto cand_act = phi::funcs::detail::GetActivationType("tanh_v2"); size_t frame_size = init_h->dims()[2]; size_t batch_size = init_h->dims()[1]; @@ -208,7 +216,7 @@ struct LSTMCell : Cell { lstm_value.state_value = last_c->data(); lstm_value.state_active_value = last_c_act->data(); T cell_clip = 0.0; - math::LstmUnitFunctor::compute( + phi::funcs::LstmUnitFunctor::compute( *device_ctx, lstm_value, frame_size, batch_size, cell_clip, gate_act, cell_act, cand_act, false); } @@ -986,18 +994,18 @@ class RNNCPUKernel : public framework::OpKernel { seed, reserve_data); } else if (is_rnn_relu(ctx)) { gate_num = 1; - RnnFunc< - SimpleRNNCell, - Layer, SingleLayer, BidirLayer, T>( + RnnFunc, + Layer, SingleLayer, BidirLayer, T>( ctx, input, weight_list, pre_state[0], nullptr, sequence_length, state[0], nullptr, output, dropout_mask, num_layers, gate_num, input_size, hidden_size, is_bidirec, mode, dropout_prob, is_test, seed, reserve_data); } else if (is_rnn_tanh(ctx)) { gate_num = 1; - RnnFunc< - SimpleRNNCell, - Layer, SingleLayer, BidirLayer, T>( + RnnFunc, + Layer, SingleLayer, BidirLayer, T>( ctx, input, weight_list, pre_state[0], nullptr, sequence_length, state[0], nullptr, output, dropout_mask, num_layers, gate_num, input_size, hidden_size, is_bidirec, mode, dropout_prob, is_test, @@ -1014,14 +1022,14 @@ class RNNCPUKernel : public framework::OpKernel { }; template -void create_lstm_value(math::LstmMetaValue* lstm_value) { +void create_lstm_value(phi::funcs::LstmMetaValue* lstm_value) { lstm_value->check_ig = nullptr; lstm_value->check_fg = nullptr; lstm_value->check_og = nullptr; } template -void create_lstm_grad(math::LstmMetaGrad* lstm_grad) { +void create_lstm_grad(phi::funcs::LstmMetaGrad* lstm_grad) { lstm_grad->check_ig_grad = nullptr; lstm_grad->check_fg_grad = nullptr; lstm_grad->check_og_grad = nullptr; @@ -1686,8 +1694,8 @@ struct GRUGradCell : GradCell { // zero pre_hidden phi::funcs::SetConstant zero; zero(device_ctx, grad_pre_hidden, static_cast(0.0)); - math::GRUMetaValue gru_value; - math::GRUMetaGrad gru_grad; + phi::funcs::GRUMetaValue gru_value; + phi::funcs::GRUMetaGrad gru_grad; gru_value.gate_value = gate_tensor->data(); gru_value.prev_out_value = pre_hidden->data(); gru_value.reset_output_value = state_tensor->data(); @@ -1703,9 +1711,9 @@ struct GRUGradCell : GradCell { grad_weight_hh->data() + 2 * frame_size * frame_size; gru_grad.bias_hh_grad = grad_bias_hh->data(); - auto act_gate = math::detail::GetActivationType("sigmoid_v2"); - auto act_node = math::detail::GetActivationType("tanh_v2"); - 
math::GRUUnitGradFunctorV2::compute( + auto act_gate = phi::funcs::detail::GetActivationType("sigmoid_v2"); + auto act_node = phi::funcs::detail::GetActivationType("tanh_v2"); + phi::funcs::GRUUnitGradFunctorV2::compute( device_ctx, gru_value, gru_grad, frame_size, batch_size, act_node, act_gate); @@ -1738,8 +1746,8 @@ struct LSTMGradCell : GradCell { backup_tensor(context, &grad_pre_state_bak, grad_pre_state); } - math::LstmMetaValue lstm_value; - math::LstmMetaGrad lstm_grad; + phi::funcs::LstmMetaValue lstm_value; + phi::funcs::LstmMetaGrad lstm_grad; create_lstm_value(&lstm_value); create_lstm_grad(&lstm_grad); lstm_value.gate_value = gate_tensor->data(); @@ -1755,12 +1763,12 @@ struct LSTMGradCell : GradCell { lstm_value.output_value = nullptr; lstm_grad.state_active_grad = nullptr; - auto gate_act = math::detail::GetActivationType("sigmoid_v2"); - auto state_act = math::detail::GetActivationType("tanh_v2"); - auto cand_act = math::detail::GetActivationType("tanh_v2"); + auto gate_act = phi::funcs::detail::GetActivationType("sigmoid_v2"); + auto state_act = phi::funcs::detail::GetActivationType("tanh_v2"); + auto cand_act = phi::funcs::detail::GetActivationType("tanh_v2"); T cell_clip = 0.0; - math::LstmUnitGradFunctor::compute( + phi::funcs::LstmUnitGradFunctor::compute( device_ctx, lstm_value, lstm_grad, frame_size, batch_size, cell_clip, gate_act, state_act, cand_act, false); this->update_pre_hidden_grad( diff --git a/paddle/fluid/operators/save_combine_op.cc b/paddle/fluid/operators/save_combine_op.cc index 6da73c99068bc0e0453dfdd1b5eca8e1add1954b..7fe6623dcca14afc8fafc4875ccfb7546e4456f0 100644 --- a/paddle/fluid/operators/save_combine_op.cc +++ b/paddle/fluid/operators/save_combine_op.cc @@ -38,7 +38,8 @@ class SaveCombineOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, const Tensor& tensor, const framework::OpKernelType& expected_kernel_type) const override { - return expected_kernel_type; + return framework::OpKernelType(expected_kernel_type.data_type_, + tensor.place()); } }; diff --git a/paddle/fluid/operators/scale_op.cc b/paddle/fluid/operators/scale_op.cc index e4410b21b541320c1d39c3ad155dfce6f74b7dc2..cbf2b9152079e13acd4a221ece402b946b844999 100644 --- a/paddle/fluid/operators/scale_op.cc +++ b/paddle/fluid/operators/scale_op.cc @@ -121,8 +121,8 @@ DECLARE_INPLACE_OP_INFERER(ScaleOpInplaceInferer, {"X", "Out"}); namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(scale, ScaleInferShapeFunctor, - PT_INFER_META(phi::UnchangedInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(scale, ScaleInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); REGISTER_OPERATOR(scale, ops::ScaleOp, ops::ScaleOpMaker, ops::ScaleGradMaker, ops::ScaleGradMaker, diff --git a/paddle/fluid/operators/scatter_nd_add_op.cc b/paddle/fluid/operators/scatter_nd_add_op.cc index bb02bb541e14f551bb749c890877e4753d225c3c..0ae0e1500c16627fc269b31c57b25c47055d7d34 100644 --- a/paddle/fluid/operators/scatter_nd_add_op.cc +++ b/paddle/fluid/operators/scatter_nd_add_op.cc @@ -12,10 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
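The rnn_op.h hunks above swap operators::math for phi::funcs, but the cell structure is unchanged: the GRU/LSTM cells fill a MetaValue struct of raw buffer pointers and resolve activations by name ("sigmoid_v2", "tanh_v2") before calling a compute functor. A simplified standalone analogue of that wiring; CellBuffers and GetActivation are invented names standing in for the Paddle types, not the real API:

    #include <cmath>
    #include <functional>
    #include <map>
    #include <stdexcept>
    #include <string>

    // Invented analogue of GRUMetaValue / LstmMetaValue: the cell collects raw
    // buffer pointers and hands them to a compute routine.
    struct CellBuffers {
      float* gate_value = nullptr;
      float* state_value = nullptr;
      float* output_value = nullptr;
      const float* prev_out_value = nullptr;
    };

    // Name-based activation lookup, mirroring the
    // GetActivationType("sigmoid_v2" / "tanh_v2") calls in the hunks above.
    std::function<float(float)> GetActivation(const std::string& name) {
      static const std::map<std::string, std::function<float(float)>> table = {
          {"sigmoid_v2", [](float x) { return 1.0f / (1.0f + std::exp(-x)); }},
          {"tanh_v2", [](float x) { return std::tanh(x); }},
      };
      const auto it = table.find(name);
      if (it == table.end()) {
        throw std::invalid_argument("unknown activation: " + name);
      }
      return it->second;
    }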
*/ -#include "paddle/fluid/operators/scatter_nd_add_op.h" #include #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/core/ddim.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/ternary.h" namespace paddle { namespace operators { @@ -24,73 +27,6 @@ class ScatterNdAddOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( - "Input(X) of ScatterNdAddOp should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasInput("Index"), true, - platform::errors::InvalidArgument( - "Input(Index) of ScatterNdAddOp should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasInput("Updates"), true, - platform::errors::InvalidArgument( - "Input(Updates) of ScatterNdAddOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output(Out) of ScatterNdAddOp should not be null.")); - - auto ref_dims = ctx->GetInputDim("X"); - auto ref_dims_size = ref_dims.size(); - auto index_dims = ctx->GetInputDim("Index"); - auto index_dims_size = index_dims.size(); - auto updates_dims = ctx->GetInputDim("Updates"); - auto updates_dims_size = updates_dims.size(); - - PADDLE_ENFORCE_LE( - index_dims[index_dims_size - 1], ref_dims_size, - platform::errors::InvalidArgument( - "The last dimension of Input(Index)'s shape should be no greater " - "than the rank of Input(X), but received the last dimension of " - "Input(Index)'s shape is %d, the rank of Input(X) is %d.", - index_dims[index_dims_size - 1], ref_dims_size)); - PADDLE_ENFORCE_GE(index_dims_size, 2UL, - platform::errors::InvalidArgument( - "The rank of Input(Index) should be greater than 1, " - "but received the rank of Input(Index) is %d.", - index_dims_size)); - - // update.shape = index.shape[:-1] + output.shape[index.shape[-1]:] - std::vector r_updates_dims; - for (int64_t i = 0; i < index_dims_size - 1; ++i) { - r_updates_dims.emplace_back(index_dims[i]); - } - for (int64_t i = index_dims[index_dims_size - 1]; i < ref_dims_size; ++i) { - r_updates_dims.emplace_back(ref_dims[i]); - } - - PADDLE_ENFORCE_EQ( - r_updates_dims.size(), updates_dims_size, - platform::errors::InvalidArgument( - "Updates has wrong shape. The shape of Updates and Input(Updates) " - "should be same, but received the shape of Updates is %d, " - "the shape of Input(Updates) is %d.", - r_updates_dims.size(), updates_dims_size)); - - for (int64_t i = 0; i < updates_dims_size; ++i) { - PADDLE_ENFORCE_EQ( - r_updates_dims[i], updates_dims[i], - platform::errors::InvalidArgument( - "Updates has wrong shape. 
The dimensions of Updates and " - "Input(Updates) should match, but received Updates's" - "%d-th dimension is %d, Input(Updates)'s %d-th " - "dimension is %d.", - i, r_updates_dims[i], i, updates_dims[i])); - } - ctx->SetOutputDim("Out", ref_dims); - ctx->ShareLoD("X", /*->*/ "Out"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -99,7 +35,8 @@ class ScatterNdAddOp : public framework::OperatorWithKernel { platform::errors::InvalidArgument( "Ref and Updates must have same type")); return framework::OpKernelType( - framework::TransToProtoVarType(ctx.Input("X")->type()), + framework::TransToProtoVarType( + ctx.Input("X")->type()), ctx.device_context()); } }; @@ -108,17 +45,6 @@ class ScatterNdAddGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - if (ctx->HasOutput(framework::GradVarName("Updates"))) { - ctx->SetOutputDim(framework::GradVarName("Updates"), - ctx->GetInputDim("Updates")); - } - if (ctx->HasOutput(framework::GradVarName("X"))) { - ctx->SetOutputDim(framework::GradVarName("X"), - ctx->GetInputDim(framework::GradVarName("Out"))); - } - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -193,22 +119,18 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(ScatterNdAddGradNoNeedBufferVarsInferer, namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(scatter_nd_add, ScatterNdAddInferShapeFunctor, + PD_INFER_META(phi::ScatterNdAddInferMeta)); + +DECLARE_INFER_SHAPE_FUNCTOR(scatter_nd_add_grad, + ScatterNdAddGradInferShapeFunctor, + PD_INFER_META(phi::ScatterNdAddGradInferMeta)); + REGISTER_OPERATOR(scatter_nd_add, ops::ScatterNdAddOp, ops::ScatterNdAddOpMaker, ops::ScatterNdAddGradMaker, - ops::ScatterNdAddGradMaker); + ops::ScatterNdAddGradMaker, + ScatterNdAddInferShapeFunctor); REGISTER_OPERATOR(scatter_nd_add_grad, ops::ScatterNdAddGradOp, - ops::ScatterNdAddGradNoNeedBufferVarsInferer); - -REGISTER_OP_CPU_KERNEL(scatter_nd_add, ops::ScatterNdAddOpKernel, - ops::ScatterNdAddOpKernel, - ops::ScatterNdAddOpKernel, - ops::ScatterNdAddOpKernel, - ops::ScatterNdAddOpKernel); - -REGISTER_OP_CPU_KERNEL(scatter_nd_add_grad, - ops::ScatterNdAddGradientOpKernel, - ops::ScatterNdAddGradientOpKernel, - ops::ScatterNdAddGradientOpKernel, - ops::ScatterNdAddGradientOpKernel, - ops::ScatterNdAddGradientOpKernel); + ops::ScatterNdAddGradNoNeedBufferVarsInferer, + ScatterNdAddGradInferShapeFunctor); diff --git a/paddle/fluid/operators/scatter_nd_add_op.cu b/paddle/fluid/operators/scatter_nd_add_op.cu deleted file mode 100644 index 6448f8cc4056d2c11806c1c342df57d597e606ba..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/scatter_nd_add_op.cu +++ /dev/null @@ -1,102 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
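The deleted ScatterNdAddOp::InferShape above enforced the rule update.shape = index.shape[:-1] + x.shape[index.shape[-1]:], which now lives in phi::ScatterNdAddInferMeta behind the new shape functor. A small standalone helper that computes the expected Updates shape under that rule, for illustration only (the function name is not from the patch):

    #include <cassert>
    #include <cstdint>
    #include <vector>

    // updates.shape = index.shape[:-1] + x.shape[index.shape[-1]:]
    std::vector<int64_t> ExpectedUpdatesShape(const std::vector<int64_t>& x_dims,
                                              const std::vector<int64_t>& index_dims) {
      assert(index_dims.size() >= 2);
      const int64_t last = index_dims.back();
      assert(last <= static_cast<int64_t>(x_dims.size()));

      // Leading dims come from the index, trailing dims from the reference tensor.
      std::vector<int64_t> updates_dims(index_dims.begin(), index_dims.end() - 1);
      for (size_t i = static_cast<size_t>(last); i < x_dims.size(); ++i) {
        updates_dims.push_back(x_dims[i]);
      }
      return updates_dims;
    }

    // Example: x_dims = {6, 7, 8}, index_dims = {4, 2} -> updates must be {4, 8}.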
*/ - -#include "paddle/fluid/operators/gather.cu.h" -#include "paddle/fluid/operators/gather_op.h" -#include "paddle/fluid/operators/scatter.cu.h" -#include "paddle/fluid/operators/scatter_nd_add_op.h" - -namespace paddle { -namespace operators { - -template -class ScatterNdAddOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet( - "This kernel only runs on GPU device.")); - auto *X = ctx.Input("X"); - auto *Ids = ctx.Input("Index"); - auto *Updates = ctx.Input("Updates"); - auto *Out = ctx.Output("Out"); - - framework::TensorCopySync(*X, ctx.GetPlace(), Out); - const auto &index_type = framework::TransToProtoVarType(Ids->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Index holds the wrong type, it holds [%s], but " - "desires to be [%s] or [%s].", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - if (index_type == framework::proto::VarType::INT32) { - GPUScatterNdAdd(ctx, *Updates, *Ids, Out); - } else { - GPUScatterNdAdd(ctx, *Updates, *Ids, Out); - } - } -}; - -template -class ScatterNdAddGradOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet( - "This kernel only runs on GPU device.")); - auto *dX = ctx.Output(framework::GradVarName("X")); - auto *dUpdates = ctx.Output(framework::GradVarName("Updates")); - auto *Ids = ctx.Input("Index"); - auto *dOut = ctx.Input(framework::GradVarName("Out")); - if (dX) { - framework::TensorCopy(*dOut, ctx.GetPlace(), dX); - } - if (dUpdates) { - dUpdates->mutable_data(ctx.GetPlace()); - // Gradient by Gather - const auto &index_type = framework::TransToProtoVarType(Ids->dtype()); - if (index_type == framework::proto::VarType::INT32) { - GPUGatherNd(ctx, *dOut, *Ids, dUpdates); - } else { - GPUGatherNd(ctx, *dOut, *Ids, dUpdates); - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -using CUDA = paddle::platform::CUDADeviceContext; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL(scatter_nd_add, - ops::ScatterNdAddOpCUDAKernel, - ops::ScatterNdAddOpCUDAKernel, - ops::ScatterNdAddOpCUDAKernel, - ops::ScatterNdAddOpCUDAKernel, - ops::ScatterNdAddOpCUDAKernel); - -REGISTER_OP_CUDA_KERNEL(scatter_nd_add_grad, - ops::ScatterNdAddGradOpCUDAKernel, - ops::ScatterNdAddGradOpCUDAKernel, - ops::ScatterNdAddGradOpCUDAKernel, - ops::ScatterNdAddGradOpCUDAKernel, - ops::ScatterNdAddGradOpCUDAKernel); diff --git a/paddle/fluid/operators/scatter_nd_add_op.h b/paddle/fluid/operators/scatter_nd_add_op.h deleted file mode 100644 index 2bdf9ec58a850ea59f7f0697bc5d0eadde0adc99..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/scatter_nd_add_op.h +++ /dev/null @@ -1,90 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/gather.h" -#include "paddle/fluid/operators/scatter.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class ScatterNdAddOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_cpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet("This kernel only runs on CPU.")); - auto *X = ctx.Input("X"); - auto *Ids = ctx.Input("Index"); - auto *Updates = ctx.Input("Updates"); - auto *Out = ctx.Output("Out"); - - // In place output: Out = X - framework::TensorCopySync(*X, ctx.GetPlace(), Out); - const auto &index_type = framework::TransToProtoVarType(Ids->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Index holds the wrong type, it holds [%s], but " - "desires to be [%s] or [%s].", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - - if (index_type == framework::proto::VarType::INT32) { - ScatterNdAdd(ctx, *Updates, *Ids, Out); - } else { - ScatterNdAdd(ctx, *Updates, *Ids, Out); - } - } -}; - -template -class ScatterNdAddGradientOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_cpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet("This kernel only runs on CPU.")); - auto *dX = ctx.Output(framework::GradVarName("X")); - auto *dUpdates = ctx.Output(framework::GradVarName("Updates")); - auto *Ids = ctx.Input("Index"); - auto *dOut = ctx.Input(framework::GradVarName("Out")); - - if (dX) { - framework::TensorCopy(*dOut, ctx.GetPlace(), dX); - } - if (dUpdates) { - dUpdates->mutable_data(ctx.GetPlace()); - // Gradient by Gather: dUpdates = dO[Ids] - const auto &index_type = framework::TransToProtoVarType(Ids->dtype()); - if (index_type == framework::proto::VarType::INT32) { - CPUGatherNd(ctx.device_context(), *dOut, *Ids, dUpdates); - } else { - CPUGatherNd(ctx.device_context(), *dOut, *Ids, dUpdates); - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/scatter_op.cc b/paddle/fluid/operators/scatter_op.cc index 3174f07e96e227c8a2f1103d3d6664673c7a2d56..5f6b04cf59e0e3c8c05d44ad6c4a3321ff2516e4 100644 --- a/paddle/fluid/operators/scatter_op.cc +++ b/paddle/fluid/operators/scatter_op.cc @@ -12,9 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/scatter_op.h" #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/core/ddim.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/ternary.h" namespace paddle { namespace operators { @@ -23,46 +26,6 @@ class ScatterOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( - "Input(X) of ScatterOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("Ids"), true, - platform::errors::InvalidArgument( - "Input(Ids) of ScatterOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("Updates"), true, - platform::errors::InvalidArgument( - "Input(Updates) of ScatterOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output(Out) of ScatterOp should not be null.")); - - auto updates_dims = ctx->GetInputDim("Updates"); - auto ref_dims = ctx->GetInputDim("X"); - PADDLE_ENFORCE_EQ( - ctx->GetInputDim("Ids").size(), 1, - platform::errors::InvalidArgument( - "The size of Input(Ids)'s shape should be equal to 1, but " - "received the rank of Input(Ids) is %d.", - ctx->GetInputDim("Ids").size())); - PADDLE_ENFORCE_EQ( - ref_dims.size(), updates_dims.size(), - platform::errors::InvalidArgument( - "Input(X) and Input(Updates) should have the same shape size, " - "but received the size of Input(x)'s shape is %d, the size of " - "Input(Updates)'s shape is %d.", - ref_dims.size(), updates_dims.size())); - PADDLE_ENFORCE_EQ( - ctx->GetInputDim("Updates")[0], ctx->GetInputDim("Ids")[0], - platform::errors::InvalidArgument( - "Input(Updates) and Input(Ids) should have same batch-size, but" - " received Input(Updates)'s batch-size is %d, Input(Ids)'s " - "batch-size is %d.", - ctx->GetInputDim("Updates")[0], ctx->GetInputDim("Ids")[0])); - ctx->SetOutputDim("Out", ref_dims); - ctx->ShareLoD("X", /*->*/ "Out"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -76,17 +39,6 @@ class ScatterGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - if (ctx->HasOutput(framework::GradVarName("Updates"))) { - ctx->SetOutputDim(framework::GradVarName("Updates"), - ctx->GetInputDim("Updates")); - } - if (ctx->HasOutput(framework::GradVarName("X"))) { - ctx->SetOutputDim(framework::GradVarName("X"), - ctx->GetInputDim(framework::GradVarName("Out"))); - } - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -151,17 +103,17 @@ DECLARE_INPLACE_OP_INFERER(ScatterInplaceInferer, {"X", "Out"}); } // namespace operators } // namespace paddle +DECLARE_INFER_SHAPE_FUNCTOR(scatter, ScatterInferShapeFunctor, + PD_INFER_META(phi::ScatterInferMeta)); + +DECLARE_INFER_SHAPE_FUNCTOR(scatter_grad, ScatterGradInferShapeFunctor, + PD_INFER_META(phi::ScatterGradInferMeta)); + namespace ops = paddle::operators; REGISTER_OPERATOR(scatter, ops::ScatterOp, ops::ScatterOpMaker, ops::ScatterGradMaker, ops::ScatterGradMaker, - ops::ScatterInplaceInferer); + ops::ScatterInplaceInferer, ScatterInferShapeFunctor); REGISTER_OPERATOR(scatter_grad, ops::ScatterGradOp, - 
ops::ScatterGradNoNeedBufferVarsInferer); -REGISTER_OP_CPU_KERNEL(scatter, ops::ScatterOpKernel, - ops::ScatterOpKernel, ops::ScatterOpKernel, - ops::ScatterOpKernel); -REGISTER_OP_CPU_KERNEL(scatter_grad, ops::ScatterGradientOpKernel, - ops::ScatterGradientOpKernel, - ops::ScatterGradientOpKernel, - ops::ScatterGradientOpKernel); + ops::ScatterGradNoNeedBufferVarsInferer, + ScatterGradInferShapeFunctor); diff --git a/paddle/fluid/operators/scatter_op.cu b/paddle/fluid/operators/scatter_op.cu deleted file mode 100644 index 549e30803b4647e3e107b0d16147c472c0dcb226..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/scatter_op.cu +++ /dev/null @@ -1,120 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/gather.cu.h" -#include "paddle/fluid/operators/gather_op.h" -#include "paddle/fluid/operators/scatter.cu.h" -#include "paddle/fluid/operators/scatter_op.h" - -namespace paddle { -namespace operators { - -template -class ScatterOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet( - "This kernel only runs on GPU device.")); - auto *X = ctx.Input("X"); - auto *Ids = ctx.Input("Ids"); - auto *Updates = ctx.Input("Updates"); - auto *Out = ctx.Output("Out"); - bool overwrite = ctx.Attr("overwrite"); - - framework::TensorCopy(*X, ctx.GetPlace(), Out); - // use template class to support int32_t and int64_t - const auto &index_type = framework::TransToProtoVarType(Ids->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; - PADDLE_ENFORCE_EQ( - index_type_match, true, - platform::errors::InvalidArgument( - "scatter_op Index holds the wrong type, it holds [%s]," - "but desires to be [%s] or [%s].", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - if (index_type == framework::proto::VarType::INT32) { - GPUScatterAssign(ctx, *Updates, *Ids, Out, overwrite); - } else { - GPUScatterAssign(ctx, *Updates, *Ids, Out, overwrite); - } - } -}; - -template -class ScatterGradOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet( - "This kernel only runs on GPU device.")); - auto *dX = ctx.Output(framework::GradVarName("X")); - auto *dUpdates = ctx.Output(framework::GradVarName("Updates")); - auto *Ids = ctx.Input("Ids"); - auto *dOut = ctx.Input(framework::GradVarName("Out")); - - const auto &index_type = framework::TransToProtoVarType(Ids->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - 
index_type == framework::proto::VarType::INT64; - PADDLE_ENFORCE_EQ( - index_type_match, true, - platform::errors::InvalidArgument( - "scatter_op index holds the wrong type, it holds [%s]," - "but desires to be [%s] or [%s]", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - - if (dX) { - framework::TensorCopy(*dOut, ctx.GetPlace(), dX); - if (index_type == framework::proto::VarType::INT32) { - GPUScatterGradForX(ctx.device_context(), *Ids, dX); - } else { - GPUScatterGradForX(ctx.device_context(), *Ids, dX); - } - } - - if (dUpdates) { - dUpdates->mutable_data(ctx.GetPlace()); - // Gradient by Gather: dUpdates = dO[Ids] - if (index_type == framework::proto::VarType::INT32) { - GPUGather(ctx.device_context(), *dOut, *Ids, dUpdates); - } else { - GPUGather(ctx.device_context(), *dOut, *Ids, dUpdates); - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(scatter, ops::ScatterOpCUDAKernel, - ops::ScatterOpCUDAKernel, - ops::ScatterOpCUDAKernel, - ops::ScatterOpCUDAKernel, - ops::ScatterOpCUDAKernel); - -REGISTER_OP_CUDA_KERNEL( - scatter_grad, ops::ScatterGradOpCUDAKernel, - ops::ScatterGradOpCUDAKernel, ops::ScatterOpCUDAKernel, - ops::ScatterOpCUDAKernel, - ops::ScatterGradOpCUDAKernel); diff --git a/paddle/fluid/operators/scatter_op.h b/paddle/fluid/operators/scatter_op.h deleted file mode 100644 index 69ab6c7135cd55468bbe8a4c65d45a466b8eaa75..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/scatter_op.h +++ /dev/null @@ -1,118 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
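// The CUDA scatter gradient deleted above (and the CPU version in scatter_op.h just
// below) implements one simple rule: d(Updates) is a gather of d(Out) at Ids, and d(X)
// is d(Out) with the scattered rows zeroed. A self-contained sketch of that rule on
// plain row-major arrays; the sizes and values are illustrative assumptions, not code
// from this diff:
#include <cstdio>
#include <vector>

int main() {
  const int cols = 2;
  // Gradient of Out, shape [4, 2], stored row-major.
  std::vector<float> d_out = {1, 2, 3, 4, 5, 6, 7, 8};
  std::vector<int> ids = {0, 2};  // rows written by the forward scatter

  // d(Updates) = d(Out)[ids]  -- "gradient by gather", as in the kernels above.
  std::vector<float> d_updates;
  for (int r : ids)
    for (int c = 0; c < cols; ++c) d_updates.push_back(d_out[r * cols + c]);

  // d(X) = d(Out) with the scattered rows zeroed -- what ScatterGradForX does.
  std::vector<float> d_x = d_out;
  for (int r : ids)
    for (int c = 0; c < cols; ++c) d_x[r * cols + c] = 0.f;

  for (float v : d_updates) std::printf("%g ", v);  // 1 2 5 6
  std::printf("\n");
  for (float v : d_x) std::printf("%g ", v);        // 0 0 3 4 0 0 7 8
  std::printf("\n");
}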
*/ - -#pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/gather.h" -#include "paddle/fluid/operators/scatter.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class ScatterOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_cpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet("This kernel only runs on CPU.")); - auto *X = ctx.Input("X"); - auto *Ids = ctx.Input("Ids"); - auto *Updates = ctx.Input("Updates"); - auto *Out = ctx.Output("Out"); - double overwrite = ctx.Attr("overwrite"); - - // In place output: Out = X, Out[Ids] = Updates - framework::TensorCopy(*X, ctx.GetPlace(), Out); - // Apply ScatterUpdate: Out[index] = Updates[:] - const auto &index_type = framework::TransToProtoVarType(Ids->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Index holds the wrong type, it holds [%s]," - "but desires to be [%s] or [%s].", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - if (overwrite) { - if (index_type == framework::proto::VarType::INT32) { - ScatterAssign(ctx.device_context(), *Updates, *Ids, Out); - } else { - ScatterAssign(ctx.device_context(), *Updates, *Ids, Out); - } - } else { - if (index_type == framework::proto::VarType::INT32) { - ScatterAssignAdd(ctx, *Updates, *Ids, Out); - } else { - ScatterAssignAdd(ctx, *Updates, *Ids, Out); - } - } - } -}; - -template -class ScatterGradientOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_cpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet("This kernel only runs on CPU.")); - auto *dX = ctx.Output(framework::GradVarName("X")); - auto *dUpdates = ctx.Output(framework::GradVarName("Updates")); - auto *Ids = ctx.Input("Ids"); - auto *dOut = ctx.Input(framework::GradVarName("Out")); - - const auto &index_type = framework::TransToProtoVarType(Ids->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; - PADDLE_ENFORCE_EQ( - index_type_match, true, - platform::errors::InvalidArgument( - "scatter_op index holds the wrong type, it holds [%s]," - "but desires to be [%s] or [%s]", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - - if (dX) { - framework::TensorCopy(*dOut, ctx.GetPlace(), dX); - if (index_type == framework::proto::VarType::INT32) { - CPUScatterGradForX(ctx.device_context(), *Ids, dX); - } else { - CPUScatterGradForX(ctx.device_context(), *Ids, dX); - } - } - - if (dUpdates) { - dUpdates->mutable_data(ctx.GetPlace()); - // Gradient by Gather: dUpdates = dO[Ids] - if (index_type == framework::proto::VarType::INT32) { - CPUGather(ctx.device_context(), *dOut, *Ids, dUpdates); - } else { - CPUGather(ctx.device_context(), *dOut, *Ids, dUpdates); - } - } - } -}; - -} // namespace operators -} // 
namespace paddle diff --git a/paddle/fluid/operators/scatter_op_npu.cc b/paddle/fluid/operators/scatter_op_npu.cc index fa5f03a092882ec1f63e9556bc38d94ed40c9a7f..815984ac307fdce14a64f01a661b4b7f7ce1d616 100644 --- a/paddle/fluid/operators/scatter_op_npu.cc +++ b/paddle/fluid/operators/scatter_op_npu.cc @@ -17,7 +17,6 @@ limitations under the License. */ #include #include "paddle/fluid/operators/kron_op.h" -#include "paddle/fluid/operators/scatter_op.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/scatter_op_xpu.cc b/paddle/fluid/operators/scatter_op_xpu.cc index 9f0b74e8a3f80c5c8a22c2db109f75e6ee316be1..07dd2f2d85fe9ac330be1f85d283c85207b1b78c 100644 --- a/paddle/fluid/operators/scatter_op_xpu.cc +++ b/paddle/fluid/operators/scatter_op_xpu.cc @@ -16,7 +16,10 @@ limitations under the License. */ #include #include -#include "paddle/fluid/operators/scatter_op.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/device_context.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/scatter_test.cc b/paddle/fluid/operators/scatter_test.cc index 0a4cab5fac1abe92b2b2457098d71a7dc3624910..93f2d60e5f232767f8e604ca98e3c39fc00caf8b 100644 --- a/paddle/fluid/operators/scatter_test.cc +++ b/paddle/fluid/operators/scatter_test.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/scatter.h" +#include "paddle/phi/kernels/funcs/scatter.h" #include @@ -43,7 +43,7 @@ TEST(scatter, ScatterUpdate) { auto* cpu_place = new paddle::platform::CPUPlace(); paddle::platform::CPUDeviceContext ctx(*cpu_place); - paddle::operators::ScatterAssign(ctx, src, index, &output); + phi::funcs::ScatterAssign(ctx, src, index, &output); for (size_t i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], 0.0f); for (size_t i = 0; i < 4; ++i) EXPECT_EQ(output.data()[i], 0.0f); diff --git a/paddle/fluid/operators/segment_pool_op.cc b/paddle/fluid/operators/segment_pool_op.cc index 322cd97f01c3ad97ba74f049696fdec592ee524e..9d4c8532a82c064b1b7aef759934ad8dad894ec5 100644 --- a/paddle/fluid/operators/segment_pool_op.cc +++ b/paddle/fluid/operators/segment_pool_op.cc @@ -12,9 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/segment_pool_op.h" #include #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -23,22 +26,6 @@ class SegmentPoolOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "SegmentPool"); - OP_INOUT_CHECK(ctx->HasInput("SegmentIds"), "Input", "SegmentIds", - "SegmentPool"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "SegmentPool"); - auto dims = ctx->GetInputDim("X"); - dims[0] = -1; - ctx->SetOutputDim("Out", dims); - - if (ctx->Attrs().Get("pooltype") == "MEAN") { - OP_INOUT_CHECK(ctx->HasOutput("SummedIds"), "Output", "SummedIds", - "SegmentPool"); - ctx->SetOutputDim("SummedIds", {-1, 1}); - } - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -150,17 +137,11 @@ class SegmentPoolGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(segment_pool, SegmentPoolInferShapeFunctor, + PD_INFER_META(phi::SegmentPoolInferMeta)); + REGISTER_OPERATOR(segment_pool, ops::SegmentPoolOp, ops::SegmentPoolOpMaker, ops::SegmentPoolGradOpMaker, - ops::SegmentPoolGradOpMaker); + ops::SegmentPoolGradOpMaker, + SegmentPoolInferShapeFunctor); REGISTER_OPERATOR(segment_pool_grad, ops::SegmentPoolGradOp); - -REGISTER_OP_CPU_KERNEL( - segment_pool, - ops::SegmentPoolKernel, - ops::SegmentPoolKernel); - -REGISTER_OP_CPU_KERNEL( - segment_pool_grad, - ops::SegmentPoolGradKernel, - ops::SegmentPoolGradKernel); diff --git a/paddle/fluid/operators/segment_pool_op.cu b/paddle/fluid/operators/segment_pool_op.cu deleted file mode 100644 index 4e20844dc3275f840ff93029abb222e2ef02e0fa..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/segment_pool_op.cu +++ /dev/null @@ -1,28 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/gather.cu.h" -#include "paddle/fluid/operators/segment_pool_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - segment_pool, - ops::SegmentPoolKernel, - ops::SegmentPoolKernel); -REGISTER_OP_CUDA_KERNEL( - segment_pool_grad, - ops::SegmentPoolGradKernel, - ops::SegmentPoolGradKernel); diff --git a/paddle/fluid/operators/segment_pool_op.h b/paddle/fluid/operators/segment_pool_op.h deleted file mode 100644 index 2f5ef7f54f988884a25feba4665283d3ce260988..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/segment_pool_op.h +++ /dev/null @@ -1,176 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/segment_pooling.h" -#include "paddle/fluid/platform/macros.h" -#include "paddle/phi/common/place.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -void SegmentKernelLaunchHelper(const framework::ExecutionContext& context) { - auto* input = context.Input("X"); - auto* segment = context.Input("SegmentIds"); - auto* output = context.Output("Out"); - std::string pooltype = context.Attr("pooltype"); - Tensor* summed_ids = nullptr; - - int64_t num_indices = segment->numel(); - PADDLE_ENFORCE_EQ( - num_indices, input->dims()[0], - platform::errors::InvalidArgument( - "Segment_ids should be the same size as dimension 0 of input X.")); - PADDLE_ENFORCE_EQ(num_indices, segment->dims()[0], - platform::errors::InvalidArgument( - "Segment_ids should be 1-D tensor, or it's other " - "dimension size is 1. 
Segment_ids's shape is: [%s].", - segment->dims())); - - if (input->numel() == 0 || segment->numel() == 0) { - return; - } - - bool cpu_place = context.GetPlace().GetType() == phi::AllocationType::CPU; - if (cpu_place) { - auto dims = input->dims(); - auto* segment_ids = segment->data(); - dims[0] = static_cast(segment_ids[segment->numel() - 1] + 1); - PADDLE_ENFORCE_GT( - dims[0], 0, - platform::errors::InvalidArgument( - "Segment ids must be >= 0, but got last id %d", dims[0])); - output->Resize({dims}); - output->mutable_data(context.GetPlace()); - phi::funcs::SetConstant set_zero; - auto& dev_ctx = context.template device_context(); - set_zero(dev_ctx, output, static_cast(0)); - } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - if (!cpu_place) { - Tensor length; - length.mutable_data(phi::make_ddim({1}), platform::CPUPlace()); - IndexT* length_data = length.data(); - const IndexT* segment_ids = segment->data(); - -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_GPU_SUCCESS( - hipMemcpy(length_data, segment_ids + num_indices - 1, sizeof(IndexT), - hipMemcpyDeviceToHost)); -#else - PADDLE_ENFORCE_GPU_SUCCESS( - cudaMemcpy(length_data, segment_ids + num_indices - 1, sizeof(IndexT), - cudaMemcpyDeviceToHost)); -#endif - - IndexT length_host = length_data[0]; - length_host++; - PADDLE_ENFORCE_GT( - length_host, 0, - platform::errors::InvalidArgument( - "Segment ids must be >= 0, but got last id %d", length_data[0])); - auto dims = input->dims(); - dims[0] = static_cast(length_host); - output->Resize({dims}); - output->mutable_data(context.GetPlace()); - T init_value = 0; - if (pooltype == "MAX") { - init_value = static_cast(-FLT_MAX); - } else if (pooltype == "MIN") { - init_value = static_cast(FLT_MAX); - } - phi::funcs::SetConstant setconst; - auto& dev_ctx = context.template device_context(); - setconst(dev_ctx, output, static_cast(init_value)); - // the gpu kernel of mean pool record the counts of segment_ids - if (pooltype == "MEAN") { - summed_ids = context.Output("SummedIds"); - summed_ids->Resize({dims[0], 1}); - summed_ids->mutable_data(context.GetPlace()); - setconst(dev_ctx, summed_ids, static_cast(1e-12)); - } - } -#endif - - SegmentPoolFunctor pool; - - pool(context.template device_context(), *input, *segment, - output, summed_ids, pooltype); -} - -template -class SegmentPoolKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* segment = context.Input("SegmentIds"); - auto index_type = framework::TransToProtoVarType(segment->dtype()); - if (index_type == framework::proto::VarType::INT32) { - SegmentKernelLaunchHelper(context); - } else if (index_type == framework::proto::VarType::INT64) { - SegmentKernelLaunchHelper(context); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Unsupported index type, Expected int, int64, but got %s.", - index_type)); - } - } -}; - -template -class SegmentPoolGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* input = context.Input("X"); - auto* output = context.Input("Out"); - auto* segment = context.Input("SegmentIds"); - auto* out_g = context.Input(framework::GradVarName("Out")); - auto* in_g = context.Output(framework::GradVarName("X")); - std::string pooltype = context.Attr("pooltype"); - - const Tensor* summed_ids = nullptr; - if (pooltype == "MEAN") { - summed_ids = context.Input("SummedIds"); - } - - in_g->mutable_data(context.GetPlace()); - 
phi::funcs::SetConstant set_zero; - auto& dev_ctx = context.template device_context(); - set_zero(dev_ctx, in_g, static_cast(0)); - - auto index_type = framework::TransToProtoVarType(segment->dtype()); - if (index_type == framework::proto::VarType::INT32) { - SegmentPoolGradFunctor pool; - pool(context.template device_context(), *input, *output, - *out_g, *segment, in_g, summed_ids, pooltype); - } else if (index_type == framework::proto::VarType::INT64) { - SegmentPoolGradFunctor pool; - pool(context.template device_context(), *input, *output, - *out_g, *segment, in_g, summed_ids, pooltype); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Unsupported index type, Expected int, int64, but got %s.", - index_type)); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/selu_op.cc b/paddle/fluid/operators/selu_op.cc index 0adf61d7ce3e5b5792b9dc65d5ac8f884dc81ea5..59c6e16535738ba6cbb3224dd4ff5c2987618cdf 100644 --- a/paddle/fluid/operators/selu_op.cc +++ b/paddle/fluid/operators/selu_op.cc @@ -12,13 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/selu_op.h" - #include #include #include -#include "paddle/fluid/operators/common_infer_shape_functions.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -30,10 +31,6 @@ class SeluOp : public framework::OperatorWithKernel { const framework::AttributeMap &attrs) : OperatorWithKernel(type, inputs, outputs, attrs) {} - void InferShape(framework::InferShapeContext *ctx) const override { - return UnaryOpUnchangedInferShape(ctx); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -123,13 +120,12 @@ class SeluGradOp : public framework::OperatorWithKernel { namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(selu, SeluInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); + REGISTER_OPERATOR(selu, ops::SeluOp, ops::SeluOpMaker, ops::SeluOpInferVarType, ops::SeluGradMaker, - ops::SeluGradMaker); + ops::SeluGradMaker, + SeluInferShapeFunctor); + REGISTER_OPERATOR(selu_grad, ops::SeluGradOp); -REGISTER_OP_CPU_KERNEL( - selu, ops::SeluKernel, - ops::SeluKernel); -REGISTER_OP_CPU_KERNEL( - selu_grad, ops::SeluGradKernel, - ops::SeluGradKernel); diff --git a/paddle/fluid/operators/selu_op.cu b/paddle/fluid/operators/selu_op.cu deleted file mode 100644 index fb3245ab7609ea9067709134a3713e9871dbb4d4..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/selu_op.cu +++ /dev/null @@ -1,22 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
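// The selu_op.cc hunk above shows, in full, the pattern most hunks in this change
// follow: the hand-written InferShape override and the REGISTER_OP_CPU_KERNEL /
// REGISTER_OP_CUDA_KERNEL lists are deleted, and shape inference is delegated to a phi
// InferMeta function through DECLARE_INFER_SHAPE_FUNCTOR. Restated here with the
// angle-bracketed GradMaker arguments that the text extraction dropped; the
// framework::OpDesc / imperative::OpBase pair is the usual choice and is an assumption
// on my part, not text preserved in this diff:
DECLARE_INFER_SHAPE_FUNCTOR(selu, SeluInferShapeFunctor,
                            PD_INFER_META(phi::UnchangedInferMeta));

REGISTER_OPERATOR(selu, ops::SeluOp, ops::SeluOpMaker, ops::SeluOpInferVarType,
                  ops::SeluGradMaker<paddle::framework::OpDesc>,
                  ops::SeluGradMaker<paddle::imperative::OpBase>,
                  SeluInferShapeFunctor);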
*/ -#include "paddle/fluid/operators/selu_op.h" - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - selu, ops::SeluKernel, - ops::SeluKernel); -REGISTER_OP_CUDA_KERNEL( - selu_grad, ops::SeluGradKernel, - ops::SeluGradKernel); diff --git a/paddle/fluid/operators/selu_op.h b/paddle/fluid/operators/selu_op.h deleted file mode 100644 index b2fc834c42f65ff3521b6267ed2f32fabbab4e4d..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/selu_op.h +++ /dev/null @@ -1,123 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math.h" -#include "paddle/fluid/platform/for_range.h" - -namespace paddle { -namespace operators { - -template -struct SeluFunctor { - SeluFunctor(const T* x_data_ptr, float alpha, float scale, T* y_data_ptr) - : x_data_ptr_(x_data_ptr), - alpha_(alpha), - scale_(scale), - y_data_ptr_(y_data_ptr) {} - - HOSTDEVICE void operator()(size_t idx) const { - T x_ele = x_data_ptr_[idx]; - if (x_ele <= 0) { - x_ele = alpha_ * real_exp(x_ele) - alpha_; - } - y_data_ptr_[idx] = scale_ * x_ele; - } - const T* x_data_ptr_; - const float alpha_; - const float scale_; - T* y_data_ptr_; -}; - -template -struct SeluGradFunctor { - SeluGradFunctor(const T* y_data_ptr, const T* dy_data_ptr, float alpha, - float scale, T* dx_data_ptr) - : y_data_ptr_(y_data_ptr), - dy_data_ptr_(dy_data_ptr), - alpha_(alpha), - scale_(scale), - la_(alpha * scale), - dx_data_ptr_(dx_data_ptr) {} - - HOSTDEVICE void operator()(size_t idx) const { - T y_ele = y_data_ptr_[idx]; - T dy_ele = dy_data_ptr_[idx]; - - float tmp = scale_; - if (y_ele <= 0) { - tmp = y_ele + la_; - } - dx_data_ptr_[idx] = dy_ele * tmp; - } - const T* y_data_ptr_; - const T* dy_data_ptr_; - const float alpha_; - const float scale_; - const float la_; - T* dx_data_ptr_; -}; - -template -class SeluKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - using Tensor = framework::Tensor; - - auto* x = context.Input("X"); - auto* out = context.Output("Out"); - - float alpha = context.Attr("alpha"); - float scale = context.Attr("scale"); - - auto out_ptr = out->mutable_data(context.GetPlace()); - - SeluFunctor functor(x->data(), alpha, scale, out_ptr); - - auto& dev_ctx = context.template device_context(); - size_t limit = static_cast(x->numel()); - platform::ForRange for_range(dev_ctx, limit); - for_range(functor); - } -}; - -template -class SeluGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - using Tensor = framework::Tensor; - - auto* out = context.Input("Out"); - auto* dout = context.Input(framework::GradVarName("Out")); - auto* dx = context.Output(framework::GradVarName("X")); - - float alpha = context.Attr("alpha"); - float scale = context.Attr("scale"); - - auto dx_ptr = dx->mutable_data(context.GetPlace()); - 
- SeluGradFunctor functor(out->data(), dout->data(), alpha, scale, - dx_ptr); - - auto& dev_ctx = context.template device_context(); - size_t limit = static_cast(out->numel()); - platform::ForRange for_range(dev_ctx, limit); - for_range(functor); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/sequence_ops/sequence_conv_op_xpu.cc b/paddle/fluid/operators/sequence_ops/sequence_conv_op_xpu.cc index 6c33ff52044b26b598f835ee40462a01077c1ff8..23c6a0133e1edafba5621825db78a52b88e6947a 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_conv_op_xpu.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_conv_op_xpu.cc @@ -184,9 +184,6 @@ class SequenceConvGradXPUKernel : public framework::OpKernel { col_data, paddle::platform::errors::Fatal("XPU memory is not enough")); if (in_g || filter_g) { - int r = xpu::constant(xpu_context, col_data, col_numel, T(0)); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant"); - bool trans_a = false; bool trans_b = true; int m = out_g->dims()[0]; @@ -208,7 +205,7 @@ class SequenceConvGradXPUKernel : public framework::OpKernel { const T* data_b = filter->data(); T* data_c = col_data; - r = xpu::fc_fusion( + int r = xpu::fc_fusion( xpu_context, data_a, data_b, data_c, m, n, k, trans_a, trans_b, nullptr, nullptr, nullptr, lda, ldb, ldc, alpha, beta, nullptr, xpu::Activation_t::LINEAR); @@ -222,7 +219,6 @@ class SequenceConvGradXPUKernel : public framework::OpKernel { in_g->mutable_data(context.GetPlace()); in_g->set_lod(in->lod()); - xpu::constant(xpu_context, in_g->data(), in_g->numel(), T(0)); int r = xpu::sequence_context_projection_grad( xpu_context, in_g->data(), col_data, nullptr, lodx, sequence_width, @@ -232,8 +228,6 @@ class SequenceConvGradXPUKernel : public framework::OpKernel { if (filter_g) { filter_g->mutable_data(context.GetPlace()); - xpu::constant(xpu_context, filter_g->data(), filter_g->numel(), - T(0)); int r = xpu::sequence_context_projection( xpu_context, in->data(), col_data, nullptr, lodx, sequence_width, diff --git a/paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc b/paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc index 2d4730635fd2aeb2e20aa5f4a637f94bce075566..25c12ab565a141f48d254d51bfca64f7422f1f42 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc @@ -16,8 +16,6 @@ limitations under the License. */ #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/gather.h" -#include "paddle/fluid/operators/scatter.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/sequence_ops/sequence_scatter_op.h b/paddle/fluid/operators/sequence_ops/sequence_scatter_op.h index 365381abc4683580b9dffb94ace9876933de495b..2960b77d5ac0f81e4dd026d9de3448cac1459645 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_scatter_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_scatter_op.h @@ -15,8 +15,7 @@ limitations under the License. 
*/ #pragma once #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/gather.h" -#include "paddle/fluid/operators/scatter.h" +#include "paddle/phi/kernels/funcs/scatter.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/set_value_op.cc b/paddle/fluid/operators/set_value_op.cc index ec3e04e71faf0b20950d87de1a7f066e2e49310a..7d0d782b837c4c828996e993634373ab38d88eac 100644 --- a/paddle/fluid/operators/set_value_op.cc +++ b/paddle/fluid/operators/set_value_op.cc @@ -241,13 +241,6 @@ REGISTER_OPERATOR(set_value, ops::SetValue, ops::SetValueMaker, ops::SetValueGradMaker, ops::SetValueOpInplaceInferer); -REGISTER_OP_CPU_KERNEL( - set_value, ops::SetValueKernel, - ops::SetValueKernel, - ops::SetValueKernel, - ops::SetValueKernel, - ops::SetValueKernel); - REGISTER_OPERATOR(set_value_grad, ops::SetValueGrad); REGISTER_OP_CPU_KERNEL( diff --git a/paddle/fluid/operators/set_value_op.cu b/paddle/fluid/operators/set_value_op.cu index f9701b0acaac769bd91bbba156a010c2e05e42c3..9f291a863c067ae0210f44befb89191678291441 100644 --- a/paddle/fluid/operators/set_value_op.cu +++ b/paddle/fluid/operators/set_value_op.cu @@ -16,13 +16,6 @@ namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - set_value, ops::SetValueKernel, - ops::SetValueKernel, - ops::SetValueKernel, - ops::SetValueKernel, - ops::SetValueKernel); - REGISTER_OP_CUDA_KERNEL( set_value_grad, ops::SetValueGradKernel, diff --git a/paddle/fluid/operators/set_value_op.h b/paddle/fluid/operators/set_value_op.h index 9dd727959202c6b09bad0f07aa242a8897583342..4d459f8c01b159549c331f9332e49ed79e7c9b16 100644 --- a/paddle/fluid/operators/set_value_op.h +++ b/paddle/fluid/operators/set_value_op.h @@ -121,201 +121,6 @@ inline void CheckIsDimsMatch(const framework::DDim first, "of target shape: %d, but now shape is %d.", second.to_str(), first.to_str())); } - -template -class SetValueKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - const int rank = ctx.Input("Input")->dims().size(); - - // TODO(liym27): A more elegent code to do this. C++ has to make template - // integer as constant, but we had better have alternative writing in the - // future. 
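// The switch below exists because Eigen's fixed-rank tensor types need the rank as a
// compile-time constant, while the op only learns it at run time. A minimal,
// self-contained sketch of the same dispatch idiom (illustrative only, not Paddle code):
#include <cstdio>
#include <stdexcept>

template <int D>
void WorkOnRank() { std::printf("instantiated for rank %d\n", D); }

void Dispatch(int rank) {
  switch (rank) {
    case 1: WorkOnRank<1>(); break;
    case 2: WorkOnRank<2>(); break;
    case 3: WorkOnRank<3>(); break;
    case 4: WorkOnRank<4>(); break;
    case 5: WorkOnRank<5>(); break;
    case 6: WorkOnRank<6>(); break;
    default: throw std::invalid_argument("rank must be in [1, 6]");
  }
}

int main() { Dispatch(3); }  // prints: instantiated for rank 3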
- switch (rank) { - case 1: - SetValueCompute<1>(ctx); - break; - case 2: - SetValueCompute<2>(ctx); - break; - case 3: - SetValueCompute<3>(ctx); - break; - case 4: - SetValueCompute<4>(ctx); - break; - case 5: - SetValueCompute<5>(ctx); - break; - case 6: - SetValueCompute<6>(ctx); - break; - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "The rank of input should be less than 7, but received %d.", rank)); - } - } - - private: - template - void SetValueCompute(const framework::ExecutionContext& ctx) const { - auto* in = ctx.Input("Input"); - auto* value_tensor = ctx.Input("ValueTensor"); - auto* out = ctx.Output("Out"); - - auto starts_tensor_list = - ctx.MultiInput("StartsTensorList"); - auto ends_tensor_list = ctx.MultiInput("EndsTensorList"); - auto steps_tensor_list = - ctx.MultiInput("StepsTensorList"); - - auto axes = ctx.Attr>("axes"); - auto starts = ctx.Attr>("starts"); - auto ends = ctx.Attr>("ends"); - auto steps = ctx.Attr>("steps"); - auto shape = ctx.Attr>("shape"); - auto decrease_axes = ctx.Attr>("decrease_axes"); - auto none_axes = ctx.Attr>("none_axes"); - - if (!starts_tensor_list.empty()) { - starts = GetDataFromTensorList(starts_tensor_list); - } - if (!ends_tensor_list.empty()) { - ends = GetDataFromTensorList(ends_tensor_list); - } - if (!steps_tensor_list.empty()) { - steps = GetDataFromTensorList(steps_tensor_list); - } - - auto in_dims = in->dims(); - CheckAndUpdateSliceAttrs(in_dims, axes, &starts, &ends, &steps); - auto slice_dims = GetSliceDims(in_dims, axes, starts, ends, &steps); - auto decrease_slice_dims = GetDecreasedDims(slice_dims, decrease_axes); - - auto slice_dims_for_assign = decrease_slice_dims; - if (!none_axes.empty()) { - std::vector slice_dims_with_none; - - size_t none_axes_cur = 0, decrease_axes_cur = 0; - for (int i = 0; i < slice_dims.size(); ++i) { - while (none_axes_cur < none_axes.size() && - none_axes[none_axes_cur] <= i) { - slice_dims_with_none.push_back(1); - none_axes_cur++; - } - if (decrease_axes_cur < decrease_axes.size() && - decrease_axes[decrease_axes_cur] == i) { - decrease_axes_cur++; - } else { - slice_dims_with_none.push_back(slice_dims[i]); - } - } - while (none_axes_cur < none_axes.size()) { - slice_dims_with_none.push_back(1); - none_axes_cur++; - } - - slice_dims_for_assign = phi::make_ddim(slice_dims_with_none); - } - - auto place = ctx.GetPlace(); - auto& eigen_place = - *ctx.template device_context().eigen_device(); - - // Here copy data from input to avoid data loss at PE and Graph level. - // TODO(liym27): Speed up in the future version. - // - Q: Why don't call ShareDataWith to speed up? - // - A: Because it's not supported to ShareDataWith on OP's input and output - // https://github.com/PaddlePaddle/Paddle/wiki/ShareDataWith-and-ShareBufferWith-are-prohibited-in-OP - // - Q: Why don't delete Input, after all, the input and output are the same - // Tensor at program level? - // - A: If deleting Input, the graph will be complex, such as there will - // be two ops points to the output in graph: op1 -> output <- set_value. - // In this case, we have to find a way to handle the running order of - // set_value is what we want. 
- paddle::framework::TensorCopy(*in, place, out); - - Tensor slice_tensor(in->dtype()), pad_tensor(in->dtype()); - slice_tensor.mutable_data(slice_dims, place); - pad_tensor.mutable_data(in_dims, place); - - auto pad_e = framework::EigenTensor::From(pad_tensor, in_dims); - auto out_e = framework::EigenTensor::From(*out); - auto slice_e = framework::EigenTensor::From(slice_tensor, slice_dims); - - // Step 1: Set the value of out at `_index` to zero - slice_e.device(eigen_place) = slice_e.constant(T(0)); - - auto starts_indices = Eigen::DSizes(); - auto ends_indices = Eigen::DSizes(); - auto strides_indices = Eigen::DSizes(); - - for (size_t i = 0; i < D; ++i) { - starts_indices[i] = 0; - ends_indices[i] = slice_dims[i]; - strides_indices[i] = 1; - } - for (size_t i = 0; i < axes.size(); i++) { - int axis_index = axes[i]; - starts_indices[axis_index] = starts[i]; - ends_indices[axis_index] = ends[i]; - strides_indices[axis_index] = steps[i]; - if (starts[i] == ends[i]) { // slice is empty, data will not be changed - return; - } - } - - out_e.stridedSlice(starts_indices, ends_indices, strides_indices) - .device(eigen_place) = slice_e; - - // Step 2: Set a tensor with the same shape as out tensor. And its data at - // '_index' is the same as value_tensor, and data out of '_index' to zero - - // - Step 2.1 Set slice tensor with value - - // NOTE(liym27): [ Why resize slice_tensor here? ] - // A: When do broadcasting on slice_tensor and value_tensor, the shape of - // slice_tensor should be decreased dims. - // e.g. - // x[:,0] = value_tensor - // x's shape = [3, 4], value_tensor's shape = [3] - // We get slice_dims = [3, 1], decrease_slice_dims = [3] - // If do broadcasting on Tensor with shape [3, 1] and [3], the result's - // shape is [3, 3], which cross the border; - // If do broadcasting on Tensor with shape [3] and [3], the result's shape - // is [3], which is right. 
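// Worked through with the note's own numbers (illustration only):
//   x.shape          = [3, 4]
//   assignment       : x[:, 0] = value_tensor, value_tensor.shape = [3]
//   axes/starts/ends = {1}/{0}/{1}, so slice_dims  = [3, 1]
//   axis 1 is decreased, so decrease_slice_dims    = [3]
//   broadcast([3, 1], [3]) -> [3, 3]   // wrong: writes outside the slice
//   broadcast([3],    [3]) -> [3]      // right: hence the Resize below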
- - slice_tensor.Resize(slice_dims_for_assign); - if (value_tensor != nullptr) { - CheckIsDimsMatch(slice_dims_for_assign, value_tensor->dims()); - // ElementwiseComputeEx can do broadcasting - ElementwiseComputeEx, DeviceContext, T>( - ctx, &slice_tensor, value_tensor, -1, SubFunctor(), &slice_tensor); - } else { - Tensor value_t(in->dtype()); - auto value_dims = phi::make_ddim(shape); - CheckIsDimsMatch(slice_dims_for_assign, value_dims); - - value_t.mutable_data(value_dims, place); - auto value_name = - GetValueName(framework::TransToProtoVarType(in->dtype())); - CopyVecotorToTensor(value_name.c_str(), &value_t, ctx); - value_t.Resize(value_dims); - ElementwiseComputeEx, DeviceContext, T>( - ctx, &slice_tensor, &value_t, -1, SubFunctor(), &slice_tensor); - } - slice_tensor.Resize(slice_dims); - - // - Step 2.2 Pad slice tensor with 0 - pad_e.device(eigen_place) = pad_e.constant(T(0)); - pad_e.stridedSlice(starts_indices, ends_indices, strides_indices) - .device(eigen_place) = slice_e; - - // Step 3: Set out tensor with value_tensor - out_e.device(eigen_place) = out_e - pad_e; - } -}; - template class SetValueGradKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/set_value_op_npu.cc b/paddle/fluid/operators/set_value_op_npu.cc index 599697059c4dcfa54fa728a8ebf88ad95f387774..46d64333b608b7f3e7b3d83664978d162b6d6e52 100644 --- a/paddle/fluid/operators/set_value_op_npu.cc +++ b/paddle/fluid/operators/set_value_op_npu.cc @@ -174,6 +174,9 @@ class SetValueNPUKernel : public framework::OpKernel { .AddInput(std::move(index_indices)) .AddInput(val_temp) .AddOutput(out_temp) +#if (CANN_VERSION_CODE >= 504001) + .AddAttrs({{"use_locking", false}}) +#endif .Run(stream); } }; diff --git a/paddle/fluid/operators/shape_op.cc b/paddle/fluid/operators/shape_op.cc index 5b7ccdde81097a2cfd74c3d65c0679d277b766a3..e2c8359beb1290f7b1b592c1ff24b15986f41f73 100644 --- a/paddle/fluid/operators/shape_op.cc +++ b/paddle/fluid/operators/shape_op.cc @@ -12,10 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/shape_op.h" #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/complex.h" namespace paddle { namespace operators { @@ -95,9 +93,3 @@ REGISTER_OPERATOR( shape, ops::ShapeOp, ops::ShapeOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL(shape, ops::ShapeKernel, ops::ShapeKernel, - ops::ShapeKernel, ops::ShapeKernel, - ops::ShapeKernel, ops::ShapeKernel, - ops::ShapeKernel, - ops::ShapeKernel>, - ops::ShapeKernel>); diff --git a/paddle/fluid/operators/shape_op.cu b/paddle/fluid/operators/shape_op.cu deleted file mode 100644 index c6e380a94f84db7de53d0c218682813fcad0128d..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/shape_op.cu +++ /dev/null @@ -1,27 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/shape_op.h" -#include "paddle/fluid/platform/complex.h" - -REGISTER_OP_CUDA_KERNEL( - shape, paddle::operators::ShapeKernel, - paddle::operators::ShapeKernel, paddle::operators::ShapeKernel, - paddle::operators::ShapeKernel, - paddle::operators::ShapeKernel, - paddle::operators::ShapeKernel, - paddle::operators::ShapeKernel, - paddle::operators::ShapeKernel, - paddle::operators::ShapeKernel>, - paddle::operators::ShapeKernel>); diff --git a/paddle/fluid/operators/shape_op.h b/paddle/fluid/operators/shape_op.h deleted file mode 100644 index 39ebcca46a710e0b817792105046af70b6298fc1..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/shape_op.h +++ /dev/null @@ -1,46 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; -using SelectedRows = phi::SelectedRows; - -template -class ShapeKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in_var = ctx.InputVar("Input"); - framework::DDim in_dims; - if (in_var->IsType()) { - in_dims = in_var->Get().value().dims(); - } else { - in_dims = in_var->Get().dims(); - } - auto* out_t = ctx.Output("Out"); - out_t->Resize({in_dims.size()}); - auto out_data = out_t->mutable_data(platform::CPUPlace()); - for (int i = 0; i < in_dims.size(); ++i) { - out_data[i] = in_dims[i]; - } - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/shape_op_npu.cc b/paddle/fluid/operators/shape_op_npu.cc index 7bff7b2d668347692309d3695eb46b1fbdb6c7dd..f751ab41014c21fda2403bd69bcd20ad549e40c7 100644 --- a/paddle/fluid/operators/shape_op_npu.cc +++ b/paddle/fluid/operators/shape_op_npu.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/shape_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/shape_op_xpu.cc b/paddle/fluid/operators/shape_op_xpu.cc index 2e9092a643253843ed09ab7475ec3ed723d5e3b8..a62d1b434e76434c3710e45e723060d3f452c91c 100644 --- a/paddle/fluid/operators/shape_op_xpu.cc +++ b/paddle/fluid/operators/shape_op_xpu.cc @@ -10,12 +10,41 @@ * limitations under the License. 
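// In the shape_op_xpu.cc hunk just below, the template parameters and registration
// types were stripped by the text extraction. A plausible reading of the added kernel,
// offered as a reconstruction only -- the element types (int32_t host output, and the
// types behind the five REGISTER_OP_XPU_KERNEL entries) are assumptions, not text
// preserved in this diff:
template <typename T>
class ShapeXPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* in_var = ctx.InputVar("Input");
    framework::DDim in_dims;
    if (in_var->IsType<phi::SelectedRows>()) {
      in_dims = in_var->Get<phi::SelectedRows>().value().dims();
    } else {
      in_dims = in_var->Get<framework::LoDTensor>().dims();
    }
    auto* out_t = ctx.Output<framework::Tensor>("Out");
    out_t->Resize({in_dims.size()});
    // The shape itself is materialized on the host as a 1-D int32 tensor.
    auto out_data = out_t->mutable_data<int32_t>(platform::CPUPlace());
    for (int i = 0; i < in_dims.size(); ++i) {
      out_data[i] = in_dims[i];
    }
  }
};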
*/ #ifdef PADDLE_WITH_XPU +#include +#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/shape_op.h" +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using SelectedRows = phi::SelectedRows; + +template +class ShapeXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in_var = ctx.InputVar("Input"); + framework::DDim in_dims; + if (in_var->IsType()) { + in_dims = in_var->Get().value().dims(); + } else { + in_dims = in_var->Get().dims(); + } + auto* out_t = ctx.Output("Out"); + out_t->Resize({in_dims.size()}); + auto out_data = out_t->mutable_data(platform::CPUPlace()); + for (int i = 0; i < in_dims.size(); ++i) { + out_data[i] = in_dims[i]; + } + } +}; +} // namespace operators +} // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_XPU_KERNEL(shape, ops::ShapeKernel, ops::ShapeKernel, - ops::ShapeKernel, ops::ShapeKernel, - ops::ShapeKernel); +REGISTER_OP_XPU_KERNEL(shape, ops::ShapeXPUKernel, + ops::ShapeXPUKernel, ops::ShapeXPUKernel, + ops::ShapeXPUKernel, ops::ShapeXPUKernel); #endif diff --git a/paddle/fluid/operators/shard_index_op.cc b/paddle/fluid/operators/shard_index_op.cc index 54555e494ffe5f2c226c7aabd47b4ce991dab2ec..053a90f2fc9fa2f93c2647c420a046401198bc28 100644 --- a/paddle/fluid/operators/shard_index_op.cc +++ b/paddle/fluid/operators/shard_index_op.cc @@ -12,7 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/shard_index_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -20,27 +23,6 @@ namespace operators { class ShardIndexOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "ShardIndex"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "ShardIndex"); - - auto x_dims = ctx->GetInputDim("X"); - PADDLE_ENFORCE_GE(x_dims.size(), 2, - platform::errors::InvalidArgument( - "Rank of Input(X) should be at least 2, " - "but the value given is %d.", - x_dims.size())); - if (ctx->IsRuntime() || x_dims[x_dims.size() - 1] > 0) { - PADDLE_ENFORCE_EQ(x_dims[x_dims.size() - 1], 1U, - platform::errors::InvalidArgument( - "The last dimension of Input(X) should be 1, " - "but the value given is %d.", - x_dims[x_dims.size() - 1])); - } - - ctx->SetOutputDim("Out", x_dims); - ctx->ShareLoD("X", /* --> */ "Out"); - } protected: framework::OpKernelType GetExpectedKernelType( @@ -114,7 +96,10 @@ Examples: } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_WITHOUT_GRADIENT(shard_index, ops::ShardIndexOp, - ops::ShardIndexOpMaker); -REGISTER_OP_CPU_KERNEL(shard_index, ops::ShardIndexCPUKernel, - ops::ShardIndexCPUKernel); +DECLARE_INFER_SHAPE_FUNCTOR(shard_index, ShardIndexInferShapeFunctor, + PD_INFER_META(phi::ShardIndexInferMeta)); +REGISTER_OPERATOR( + shard_index, ops::ShardIndexOp, ops::ShardIndexOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker, + ShardIndexInferShapeFunctor); diff --git a/paddle/fluid/operators/shard_index_op.cu b/paddle/fluid/operators/shard_index_op.cu deleted 
file mode 100644 index 115b3f47d664ba00228343d221d5be70d13a7ff1..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/shard_index_op.cu +++ /dev/null @@ -1,96 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/shard_index_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -namespace paddle { -namespace operators { - -using platform::PADDLE_CUDA_NUM_THREADS; - -template -__global__ void ShardIndexInner(const T* in_data, T* out_data, - const int64_t numel, const int index_num, - const int nshards, const int shard_id, - const int ignore_value) { - int shard_size = (index_num + nshards - 1) / nshards; - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < numel) { - assert(in_data[idx] >= 0 && in_data[idx] < index_num); - if (in_data[idx] / shard_size == shard_id) { - out_data[idx] = in_data[idx] % shard_size; - } else { - out_data[idx] = ignore_value; - } - } -} - -using LoDTensor = framework::LoDTensor; - -template -class ShardIndexCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in = context.Input("X"); - auto* out = context.Output("Out"); - int index_num = context.Attr("index_num"); - int nshards = context.Attr("nshards"); - int shard_id = context.Attr("shard_id"); - int ignore_value = context.Attr("ignore_value"); - PADDLE_ENFORCE_GT( - index_num, 0, - platform::errors::InvalidArgument( - "The value 'index_num' for Op(shard_index) must be greater than 0, " - "but the value given is %d.", - index_num)); - PADDLE_ENFORCE_GT(nshards, 0, - platform::errors::InvalidArgument( - "The value 'nshard' for Op(shard_index) must be " - "greater than 0, but the value given is %d.", - nshards)); - PADDLE_ENFORCE_GE( - shard_id, 0, - platform::errors::InvalidArgument( - "The value 'shard_id' for Op(shard_index) must be greater or " - "equal to 0, but the value given is %d.", - shard_id)); - PADDLE_ENFORCE_LT( - shard_id, nshards, - platform::errors::InvalidArgument( - "The value 'shard_id' for Op(shard_index) must be less than " - "nshards (%d), but the value given is %d.", - nshards, shard_id)); - - out->Resize(in->dims()); - out->set_lod(in->lod()); - auto* in_data = in->data(); - auto* out_data = out->mutable_data(context.GetPlace()); - int64_t numel = in->numel(); - auto stream = - context.template device_context().stream(); - ShardIndexInner<<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / - PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>( - in_data, out_data, numel, index_num, nshards, shard_id, ignore_value); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(shard_index, ops::ShardIndexCUDAKernel, - ops::ShardIndexCUDAKernel); diff --git a/paddle/fluid/operators/shard_index_op.h b/paddle/fluid/operators/shard_index_op.h 
deleted file mode 100644 index c2fe3711686d4c4c802fadd66d4bc994232ef5ec..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/shard_index_op.h +++ /dev/null @@ -1,84 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using LoDTensor = framework::LoDTensor; -template -class ShardIndexCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in = context.Input("X"); - auto* out = context.Output("Out"); - int index_num = context.Attr("index_num"); - int nshards = context.Attr("nshards"); - int shard_id = context.Attr("shard_id"); - int ignore_value = context.Attr("ignore_value"); - PADDLE_ENFORCE_GT( - index_num, 0, - platform::errors::InvalidArgument( - "The value 'index_num' for Op(shard_index) must be greater than 0, " - "but the value given is %d.", - index_num)); - PADDLE_ENFORCE_GT(nshards, 0, - platform::errors::InvalidArgument( - "The value 'nshard' for Op(shard_index) must be " - "greater than 0, but the value given is %d.", - nshards)); - PADDLE_ENFORCE_GE( - shard_id, 0, - platform::errors::InvalidArgument( - "The value 'shard_id' for Op(shard_index) must be greater or " - "equal to 0, but the value given is %d.", - shard_id)); - PADDLE_ENFORCE_LT( - shard_id, nshards, - platform::errors::InvalidArgument( - "The value 'shard_id' for Op(shard_index) must be less than " - "nshards (%d), but the value given is %d.", - nshards, shard_id)); - - int shard_size = (index_num + nshards - 1) / nshards; - - out->Resize(in->dims()); - out->set_lod(in->lod()); - auto* in_data = in->data(); - auto* out_data = out->mutable_data(context.GetPlace()); - int64_t numel = in->numel(); - for (int64_t i = 0; i < numel; ++i) { - PADDLE_ENFORCE_GE(in_data[i], 0, - platform::errors::InvalidArgument( - "The input_index for Op(shard_index) must be " - "greater or equal to 0, but the value given is %d.", - in_data[i])); - PADDLE_ENFORCE_LT(in_data[i], index_num, - platform::errors::InvalidArgument( - "The input_index for Op(shard_index) must be less " - "than index_num (%d), but the value given is %d.", - index_num, in_data[i])); - if (in_data[i] / shard_size == shard_id) { - out_data[i] = in_data[i] % shard_size; - } else { - out_data[i] = ignore_value; - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/shard_index_op_npu.cc b/paddle/fluid/operators/shard_index_op_npu.cc index dc2e8ad58f31ce8fe845ecb1f368544704e1d9ad..c875448424a24e686b9a6285725f801d604abc46 100644 --- a/paddle/fluid/operators/shard_index_op_npu.cc +++ b/paddle/fluid/operators/shard_index_op_npu.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
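// The shard_index kernels deleted above (CUDA and CPU alike) reduce to the same few
// lines of integer arithmetic. A standalone sketch with made-up numbers; the attribute
// names match the op, the values are illustrative assumptions:
#include <cstdio>

int ShardIndex(int in, int index_num, int nshards, int shard_id, int ignore_value) {
  const int shard_size = (index_num + nshards - 1) / nshards;  // ceiling division
  return (in / shard_size == shard_id) ? in % shard_size : ignore_value;
}

int main() {
  // index_num = 20, nshards = 2  ->  shard_size = 10
  std::printf("%d\n", ShardIndex(17, 20, 2, 1, -1));  // 7   (index 17 lives on shard 1)
  std::printf("%d\n", ShardIndex(17, 20, 2, 0, -1));  // -1  (ignored on shard 0)
}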
-#include "paddle/fluid/operators/shard_index_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc index a4e80343903d5a48dda584dc1f203782adb36787..016ff54645b02e9b3ddfb67595d830ccf5dcfd94 100644 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc @@ -12,59 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h" #include #include #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { using framework::Tensor; +const int kIgnoreIndex = -100; class SigmoidCrossEntropyWithLogitsOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", - "SigmoidCrossEntropyWithLogitsOp"); - OP_INOUT_CHECK(ctx->HasInput("Label"), "Input", "Label", - "SigmoidCrossEntropyWithLogitsOp"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", - "SigmoidCrossEntropyWithLogitsOp"); - - auto x_dims = ctx->GetInputDim("X"); - auto labels_dims = ctx->GetInputDim("Label"); - - int rank = x_dims.size(); - PADDLE_ENFORCE_EQ(rank, labels_dims.size(), - platform::errors::InvalidArgument( - "Input(X) and Input(Label) shall have the same rank." - "But received: the rank of Input(X) is [%d], " - "the rank of Input(Label) is [%d].", - rank, labels_dims.size())); - - bool check = true; - if ((!ctx->IsRuntime()) && - (phi::product(x_dims) <= 0 || phi::product(labels_dims) <= 0)) { - check = false; - } - - if (check) { - PADDLE_ENFORCE_EQ( - phi::slice_ddim(x_dims, 0, rank), - phi::slice_ddim(labels_dims, 0, rank), - platform::errors::InvalidArgument( - "Input(X) and Input(Label) shall have the same shape " - "except the last dimension. 
But received: the shape of " - "Input(X) is [%s], the shape of Input(Label) is [%s].", - x_dims, labels_dims)); - } - - ctx->ShareDim("X", /*->*/ "Out"); - ctx->ShareLoD("X", /*->*/ "Out"); - } }; class SigmoidCrossEntropyWithLogitsGradOp @@ -200,23 +164,17 @@ DECLARE_INPLACE_OP_INFERER(SigmoidCrossEntropyWithLogitsGradInplaceInferer, } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR( + sigmoid_cross_entropy_with_logits, + SigmoidCrossEntropyWithLogitsInferShapeFunctor, + PD_INFER_META(phi::SigmoidCrossEntropyWithLogitsInferMeta)); REGISTER_OPERATOR( sigmoid_cross_entropy_with_logits, ops::SigmoidCrossEntropyWithLogitsOp, ops::SigmoidCrossEntropyWithLogitsOpMaker, ops::SigmoidCrossEntropyWithLogitsGradOpMaker, ops::SigmoidCrossEntropyWithLogitsGradOpMaker, - ops::SigmoidCrossEntropyWithLogitsInplaceInferer); + ops::SigmoidCrossEntropyWithLogitsInplaceInferer, + SigmoidCrossEntropyWithLogitsInferShapeFunctor); REGISTER_OPERATOR(sigmoid_cross_entropy_with_logits_grad, ops::SigmoidCrossEntropyWithLogitsGradOp, ops::SigmoidCrossEntropyWithLogitsGradInplaceInferer); -REGISTER_OP_CPU_KERNEL( - sigmoid_cross_entropy_with_logits, - ops::SigmoidCrossEntropyWithLogitsKernel, - ops::SigmoidCrossEntropyWithLogitsKernel); -REGISTER_OP_CPU_KERNEL(sigmoid_cross_entropy_with_logits_grad, - ops::SigmoidCrossEntropyWithLogitsGradKernel< - paddle::platform::CPUDeviceContext, float>, - ops::SigmoidCrossEntropyWithLogitsGradKernel< - paddle::platform::CPUDeviceContext, double>); diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu deleted file mode 100644 index 40476d5e11f6a3b0cad21038a3f342d824f3575c..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu +++ /dev/null @@ -1,193 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#ifdef __NVCC__ -#include "cub/cub.cuh" -#endif -#ifdef __HIPCC__ -#include -namespace cub = hipcub; -#endif -#include "paddle/fluid/memory/malloc.h" -#include "paddle/fluid/operators/math.h" -#include "paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" -#include "paddle/phi/core/hostdevice.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -#ifdef __HIPCC__ -static constexpr int kNumCUDAThreads = 256; -#else -static constexpr int kNumCUDAThreads = 512; -#endif -static constexpr int kNumMaxinumNumBlocks = 4096; - -static inline int NumBlocks(const int N) { - return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, - kNumMaxinumNumBlocks); -} - -template -__global__ void GPUSigmoidForward(const T *x_data, const T *label_data, - const int ignore_index, const int limit, - T *out_data, T *counts) { - CUDA_KERNEL_LOOP(i, limit) { - T x = x_data[i]; - T label = label_data[i]; - T eps = static_cast(1e-5); - T diff = label - static_cast(ignore_index); - if ((diff > -eps) && (diff < eps)) { - out_data[i] = static_cast(0.); - counts[i] = 0; - } else { - T term1 = (x > 0) ? x : 0; - T term2 = x * label; - T term3 = real_log(static_cast(1) + real_exp(static_cast(-abs(x)))); - out_data[i] = term1 - term2 + term3; - counts[i] = 1; - } - } -} - -template -__global__ void Sum(const T *counts, int num, const T eps, T *sum) { - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - T in = 0; - for (int i = threadIdx.x; i < num; i += BlockDim) { - in += counts[i]; - } - __syncthreads(); - auto out = - BlockReduce(temp_storage).Reduce(static_cast(in), cub::Sum()); - __syncthreads(); - if (threadIdx.x == 0) { - T a = out > eps ? 
out : eps; - sum[0] = a; - } -} - -template -__global__ void Div(T *loss, const int num, const T *norm) { - CUDA_KERNEL_LOOP(i, num) { loss[i] /= norm[0]; } -} - -template -__global__ void GPUSigmoidBackward(const T *x_data, const T *label_data, - const int ignore_index, const T *dout_data, - const int limit, T *dx_data, T *counts) { - CUDA_KERNEL_LOOP(i, limit) { - T x = x_data[i]; - T label = label_data[i]; - T dout = dout_data[i]; - T eps = static_cast(1e-5); - T diff = label - static_cast(ignore_index); - if ((diff > -eps) && (diff < eps)) { - dx_data[i] = static_cast(0.); - counts[i] = 0; - } else { - T simoid_x = static_cast(1) / (static_cast(1) + real_exp(-x)); - T diff = simoid_x - label; - dx_data[i] = dout * diff; - counts[i] = 1; - } - } -} - -// Out = max(X, 0) - X * Labels + log(1 + exp(-abs(X))) -template -class GPUSigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - const Tensor *X = context.Input("X"); - const Tensor *Labels = context.Input("Label"); - Tensor *Out = context.Output("Out"); - int ignore_index = context.Attr("ignore_index"); - auto out_data = Out->mutable_data(context.GetPlace()); - - auto &dev_ctx = context.cuda_device_context(); - bool normalize = context.Attr("normalize"); - - // Temporary memory - auto cnt_ptr = memory::Alloc(dev_ctx, Labels->numel() * sizeof(T)); - T *counts = reinterpret_cast(cnt_ptr->ptr()); - - int limit = Out->numel(); - int blocks = NumBlocks(limit); - int threads = kNumCUDAThreads; - GPUSigmoidForward<<>>( - X->data(), Labels->data(), ignore_index, limit, out_data, counts); - if (normalize) { - auto norm_ptr = memory::Alloc(dev_ctx, sizeof(T)); - T *norm = reinterpret_cast(norm_ptr->ptr()); - Sum<<<1, kNumCUDAThreads, 0, dev_ctx.stream()>>>( - counts, limit, static_cast(1e-5), norm); - Div<<>>(out_data, limit, norm); - } - } -}; - -// dX = sigmoid(X) - labels -template -class GPUSigmoidCrossEntropyWithLogitsGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - const Tensor *X = context.Input("X"); - const Tensor *Labels = context.Input("Label"); - const Tensor *dOut = context.Input(framework::GradVarName("Out")); - Tensor *dX = context.Output(framework::GradVarName("X")); - auto dx_data = dX->mutable_data(context.GetPlace()); - - int ignore_index = context.Attr("ignore_index"); - - auto &dev_ctx = context.cuda_device_context(); - // Temporary memory - auto cnt_ptr = memory::Alloc(dev_ctx, X->numel() * sizeof(T)); - T *counts = reinterpret_cast(cnt_ptr->ptr()); - - int limit = dX->numel(); - int blocks = NumBlocks(limit); - int threads = kNumCUDAThreads; - GPUSigmoidBackward<<>>( - X->data(), Labels->data(), ignore_index, dOut->data(), limit, - dx_data, counts); - bool normalize = context.Attr("normalize"); - if (normalize) { - auto norm_ptr = memory::Alloc(dev_ctx, sizeof(T)); - T *norm = reinterpret_cast(norm_ptr->ptr()); - Sum<<<1, kNumCUDAThreads, 0, dev_ctx.stream()>>>( - counts, limit, static_cast(1e-5), norm); - Div<<>>(dx_data, limit, norm); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(sigmoid_cross_entropy_with_logits, - ops::GPUSigmoidCrossEntropyWithLogitsKernel< - paddle::platform::CUDADeviceContext, float>, - ops::GPUSigmoidCrossEntropyWithLogitsKernel< - paddle::platform::CUDADeviceContext, double>); -REGISTER_OP_CUDA_KERNEL(sigmoid_cross_entropy_with_logits_grad, 
- ops::GPUSigmoidCrossEntropyWithLogitsGradKernel< - paddle::platform::CUDADeviceContext, float>, - ops::GPUSigmoidCrossEntropyWithLogitsGradKernel< - paddle::platform::CUDADeviceContext, double>); diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h deleted file mode 100644 index d2ced490ceff474e1e7624c591a9d142b4199c2f..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h +++ /dev/null @@ -1,114 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -const int kIgnoreIndex = -100; - -// Out = max(X, 0) - X * Labels + log(1 + exp(-abs(X))) -template -class SigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - const Tensor *X = context.Input("X"); - const Tensor *Labels = context.Input("Label"); - Tensor *Out = context.Output("Out"); - int ignore_index = context.Attr("ignore_index"); - auto out_data = Out->mutable_data(context.GetPlace()); - int limit = Out->numel(); - auto x_data = X->data(); - auto label_data = Labels->data(); - for (int idx = 0; idx < limit; ++idx) { - T x = x_data[idx]; - T label = label_data[idx]; - if (static_cast(label) == ignore_index) { - out_data[idx] = static_cast(0.); - } else { - T term1 = (x > 0) ? x : 0; - T term2 = x * label; - T term3 = std::log(static_cast(1) + std::exp(-std::abs(x))); - out_data[idx] = term1 - term2 + term3; - } - } - bool normalize = context.Attr("normalize"); - if (normalize) { - int norm = 0; - T eps = static_cast(1e-6); - for (int idx = 0; idx < limit; ++idx) { - T diff = label_data[idx] - static_cast(ignore_index); - if ((diff < -eps) || (diff > eps)) { - norm += 1; - } - } - eps = static_cast(1e-5); - norm = norm > eps ? 
norm : eps; - std::for_each(out_data, out_data + limit, [norm](T &v) { v = v / norm; }); - } - } -}; - -// dX = sigmoid(X) - labels -template -class SigmoidCrossEntropyWithLogitsGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - const Tensor *X = context.Input("X"); - const Tensor *Labels = context.Input("Label"); - const Tensor *dOut = context.Input(framework::GradVarName("Out")); - Tensor *dX = context.Output(framework::GradVarName("X")); - auto dx_data = dX->mutable_data(context.GetPlace()); - - int ignore_index = context.Attr("ignore_index"); - int limit = dX->numel(); - auto x_data = X->data(); - auto label_data = Labels->data(); - auto dout_data = dOut->data(); - for (int idx = 0; idx < limit; ++idx) { - T x = x_data[idx]; - T label = label_data[idx]; - T dout = dout_data[idx]; - if (static_cast(label) == ignore_index) { - dx_data[idx] = static_cast(0.); - } else { - T simoid_x = static_cast(1) / (static_cast(1) + std::exp(-x)); - T diff = simoid_x - label; - dx_data[idx] = dout * diff; - } - } - bool normalize = context.Attr("normalize"); - if (normalize) { - int norm = 0; - T eps = static_cast(1e-6); - for (int idx = 0; idx < limit; ++idx) { - T diff = label_data[idx] - static_cast(ignore_index); - if ((diff < -eps) || (diff > eps)) { - norm += 1; - } - } - eps = static_cast(1e-5); - norm = norm > eps ? norm : eps; - std::for_each(dx_data, dx_data + limit, [norm](T &v) { v = v / norm; }); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_npu.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_npu.cc index 40852425997f0b1a9cfa0c86180f2f2254efceec..f186f95a2b96117fa56fc17f70d4d0884214af87 100644 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_npu.cc +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_npu.cc @@ -12,13 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
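For reference, the arithmetic the deleted CPU and CUDA kernels above implemented (and that the phi sigmoid_cross_entropy_with_logits kernels now provide) is the numerically stable binary cross entropy with logits: Out = max(X, 0) - X * Label + log(1 + exp(-|X|)), with dX = (sigmoid(X) - Label) * dOut, elements whose label equals ignore_index zeroed out, and an optional division by the number of counted elements when normalize is set. A minimal standalone restatement of the forward pass (illustrative only, not the phi implementation; the eps clamp of the deleted code is simplified to a max with 1):

#include <cmath>
#include <cstddef>

template <typename T>
void SigmoidCrossEntropyWithLogitsRef(const T* x, const T* label, T* out,
                                      std::size_t n, int ignore_index,
                                      bool normalize) {
  std::size_t counted = 0;  // number of elements whose label != ignore_index
  for (std::size_t i = 0; i < n; ++i) {
    if (static_cast<int>(label[i]) == ignore_index) {
      out[i] = static_cast<T>(0);
      continue;
    }
    const T xi = x[i];
    // max(x, 0) - x * label + log(1 + exp(-|x|)) avoids overflow of exp(x).
    out[i] = (xi > 0 ? xi : static_cast<T>(0)) - xi * label[i] +
             std::log(static_cast<T>(1) + std::exp(-std::abs(xi)));
    ++counted;
  }
  if (normalize) {
    const T norm = static_cast<T>(counted > 0 ? counted : 1);
    for (std::size_t i = 0; i < n; ++i) out[i] /= norm;
  }
}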
*/ -#include "paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { namespace operators { using Tensor = framework::Tensor; +const int kIgnoreIndex = -100; void CheckAttrs(const framework::ExecutionContext& ctx) { // Add this check is is due to Ascend SigmoidCrossEntropyWithLogits diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_xpu.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_xpu.cc index 6395aa1caa01b9578d55e1155b0d6cd0d2295e36..c37731580d1212cb47c9e7f18aa4a9ba20af19d8 100644 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_xpu.cc +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_xpu.cc @@ -17,13 +17,15 @@ #include #include -#include "paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/device/xpu/xpu_header.h" namespace paddle { namespace operators { +using Tensor = framework::Tensor; + template class SigmoidCrossEntropyWithLogitsXPUKernel : public framework::OpKernel { using XPUType = typename XPUTypeTrait::Type; diff --git a/paddle/fluid/operators/sign_op.cc b/paddle/fluid/operators/sign_op.cc index e2381c76f7e45a962fcacff079ca67df9610b6f1..ceb42dcf3e592182867a890bdfe73e237913ee53 100644 --- a/paddle/fluid/operators/sign_op.cc +++ b/paddle/fluid/operators/sign_op.cc @@ -60,8 +60,8 @@ class SignGradMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(sign, SignInferShapeFunctor, - PT_INFER_META(phi::UnchangedInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(sign, SignInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); REGISTER_OPERATOR(sign, ops::SignOp, ops::SignOpMaker, ops::SignGradMaker, ops::SignGradMaker, diff --git a/paddle/fluid/operators/size_op.cc b/paddle/fluid/operators/size_op.cc index e584c1a4cce1e85344c574526098b034723c3059..84b0f403be03893810ef592db9b2c993cc6b9644 100644 --- a/paddle/fluid/operators/size_op.cc +++ b/paddle/fluid/operators/size_op.cc @@ -44,8 +44,8 @@ Return the number of elements in the input. } // namespace paddle namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(size, SizeInferShapeFunctor, - PT_INFER_META(phi::SizeInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(size, SizeInferShapeFunctor, + PD_INFER_META(phi::SizeInferMeta)); REGISTER_OPERATOR( size, ops::SizeOp, ops::SizeOpMaker, paddle::framework::EmptyGradOpMaker, diff --git a/paddle/fluid/operators/softmax_op_npu_test.cc b/paddle/fluid/operators/softmax_op_npu_test.cc index 3bc55fafd81e18d0a986268ff4692129c6515edc..3148b31a8322e2bab39ad7f723ee59a6db64c204 100644 --- a/paddle/fluid/operators/softmax_op_npu_test.cc +++ b/paddle/fluid/operators/softmax_op_npu_test.cc @@ -22,7 +22,6 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op_mlu.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op_mlu.cc index 1cd6f8b7698b949a8e198c766fcf193e13481298..34650c2e06245532eda5ebcf9e8d8454ee93237b 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op_mlu.cc +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op_mlu.cc @@ -37,7 +37,7 @@ class SoftmaxWithCrossEntropyMLUKernel : public framework::OpKernel { "the mlu kernel of softmax_with_cross_entropy.")); const int rank = logits->dims().size(); - const int axis = CanonicalAxis(ctx.Attr("axis"), rank); + const int axis = phi::funcs::CanonicalAxis(ctx.Attr("axis"), rank); loss->mutable_data(ctx.GetPlace()); backprop->mutable_data(ctx.GetPlace()); @@ -45,10 +45,10 @@ class SoftmaxWithCrossEntropyMLUKernel : public framework::OpKernel { // cnnl softmax only support 3-dims, regard all shape as [d1, d2, d3] const int cnnl_softmax_dims = 3; - const int d1 = SizeToAxis(axis, logits->dims()); + const int d1 = phi::funcs::SizeToAxis(axis, logits->dims()); const int d2_logits = logits->dims()[axis]; const int d2_labels = labels->dims()[axis]; - const int d3 = SizeOutAxis(axis, logits->dims()); + const int d3 = phi::funcs::SizeOutAxis(axis, logits->dims()); // CNNL_SOFTMAX_MODE_LOW_DIMENSION has better perfermence, use it as much as // possible. diff --git a/paddle/fluid/operators/spectral_op.h b/paddle/fluid/operators/spectral_op.h index 2bc5124843c38152d2f5d3ffcef5a5ca24534bfd..a60ec5a4df52b8275a17185a63c8a7d27dd8132b 100644 --- a/paddle/fluid/operators/spectral_op.h +++ b/paddle/fluid/operators/spectral_op.h @@ -23,9 +23,9 @@ #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/operators/conj_op.h" #include "paddle/fluid/operators/eigen/eigen_function.h" -#include "paddle/fluid/operators/math/padding.h" #include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/phi/kernels/funcs/padding.h" #if defined(__NVCC__) || defined(__HIPCC__) #include "thrust/device_vector.h" #endif @@ -389,8 +389,9 @@ class FFTR2CGradKernel : public framework::OpKernel { std::vector pads(rank * 2, 0); pads[axes.back() * 2 + 1] = zero_length; - paddle::operators::math::PaddingFunctor( - rank, ctx, pads, static_cast(0), *dy, &full_dy); + phi::funcs::PaddingFunctor( + rank, ctx.template device_context(), pads, + static_cast(0), *dy, &full_dy); fft_c2c_func(dev_ctx, &full_dy, &complex_dx, axes, normalization, !forward); } diff --git a/paddle/fluid/operators/split_op.cc b/paddle/fluid/operators/split_op.cc index a8f05d94563e57a20cc41ba1edd68872d869d00e..5b8922505cc089d66f0b444fc65ccec8ed051876 100644 --- a/paddle/fluid/operators/split_op.cc +++ b/paddle/fluid/operators/split_op.cc @@ -15,6 +15,9 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/split_op.h" #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/infermeta/unary.h" + namespace paddle { namespace operators { using framework::Tensor; diff --git a/paddle/fluid/operators/squeeze_op_npu_test.cc b/paddle/fluid/operators/squeeze_op_npu_test.cc index 956544c53609eb29326dc5cf295d978d767ac176..d61f5aa3f634cd2aee1e5c2f34f4467b1697e455 100644 --- a/paddle/fluid/operators/squeeze_op_npu_test.cc +++ b/paddle/fluid/operators/squeeze_op_npu_test.cc @@ -24,7 +24,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/strided_memcpy.h b/paddle/fluid/operators/strided_memcpy.h index c92d468f3462c92cd0631383996012afb6edb46b..af29aac6b9052877283271abc12f4dc1da6b8a3e 100644 --- a/paddle/fluid/operators/strided_memcpy.h +++ b/paddle/fluid/operators/strided_memcpy.h @@ -109,6 +109,11 @@ inline void StridedNumelCopyWithAxis(const platform::DeviceContext& ctx, auto& npu_ctx = reinterpret_cast(ctx); memory::Copy(npu_place, dst + i * dst_after, npu_place, src + i * src_after, sizeof(T) * size, npu_ctx.stream()); +#elif defined(PADDLE_WITH_MLU) + auto& mlu_place = place; + auto& mlu_ctx = reinterpret_cast(ctx); + memory::Copy(mlu_place, dst + i * dst_after, mlu_place, + src + i * src_after, sizeof(T) * size, mlu_ctx.stream()); #else PADDLE_THROW(platform::errors::PreconditionNotMet( "Paddle is not compiled with GPU.")); diff --git a/paddle/fluid/operators/sum_op.cu b/paddle/fluid/operators/sum_op.cu index 3e2d2a5495b3428ce0fad9d61431d53b44eea330..33590c1d7cca04e215e55abb26fb2aa3c3b61bec 100644 --- a/paddle/fluid/operators/sum_op.cu +++ b/paddle/fluid/operators/sum_op.cu @@ -258,4 +258,5 @@ REGISTER_OP_CUDA_KERNEL( ops::SumKernel, ops::SumKernel, ops::SumKernel, - ops::SumKernel); + ops::SumKernel, + ops::SumKernel); diff --git a/paddle/fluid/operators/svd_helper.h b/paddle/fluid/operators/svd_helper.h index bcb3ee44f04657f1afcb9e85dbc01fde71562c39..166f49999d552917021a545b2799ae33ff257f06 100644 --- a/paddle/fluid/operators/svd_helper.h +++ b/paddle/fluid/operators/svd_helper.h @@ -105,7 +105,7 @@ struct RealMulComplexFunctor { "The image part of y must to be 0" "but got [%d]", y.imag)); - return platform::complex>(x.real * y.real, + return platform::complex>(x.real * y.real, x.imag * y.real); } }; @@ -391,11 +391,11 @@ struct DeviceIndependenceTensorOperations { // batch_diag for CPU only Tensor BatchDiag(const Tensor& x, int batch) { Tensor out; - auto* x_data = x.data>(); + auto* x_data = x.data>(); auto numel = x.numel(); - auto* out_data = out.mutable_data>( + auto* out_data = out.mutable_data>( x.dims(), context.GetPlace(), - static_cast(numel * sizeof(phi::funcs::Real))); + static_cast(numel * sizeof(phi::dtype::Real))); auto x_dims = x.dims(); int num_dims = x_dims.size(); @@ -661,9 +661,9 @@ struct DeviceIndependenceTensorOperations { Tensor Real(const Tensor& x) { Tensor out; auto numel = x.numel(); - auto* out_data = out.mutable_data>( + auto* out_data = out.mutable_data>( x.dims(), context.GetPlace(), - static_cast(numel * sizeof(phi::funcs::Real))); + static_cast(numel * sizeof(phi::dtype::Real))); auto* x_data = x.data(); auto for_range = GetForRange(numel); phi::funcs::RealFunctor functor(x_data, out_data, numel); diff 
--git a/paddle/fluid/operators/svd_op.h b/paddle/fluid/operators/svd_op.h index f5e451ac7054d15919170f06f4fd225a2243f5d1..42a847206a3cb6fecc421effa9e9d10bacc80be4 100644 --- a/paddle/fluid/operators/svd_op.h +++ b/paddle/fluid/operators/svd_op.h @@ -46,14 +46,14 @@ class SvdCPUKernel : public framework::OpKernel { int col_u = full ? rows : k; int col_v = full ? cols : k; int batches = numel / (rows * cols); - auto* U_out = U->mutable_data>( + auto* U_out = U->mutable_data>( context.GetPlace(), - size_t(batches * rows * col_u * sizeof(phi::funcs::Real))); - auto* VH_out = VH->mutable_data>( + size_t(batches * rows * col_u * sizeof(phi::dtype::Real))); + auto* VH_out = VH->mutable_data>( context.GetPlace(), - size_t(batches * col_v * cols * sizeof(phi::funcs::Real))); - auto* S_out = S->mutable_data>( - context.GetPlace(), size_t(batches * k * sizeof(phi::funcs::Real))); + size_t(batches * col_v * cols * sizeof(phi::dtype::Real))); + auto* S_out = S->mutable_data>( + context.GetPlace(), size_t(batches * k * sizeof(phi::dtype::Real))); /*SVD Use the Eigen Library*/ math::BatchSvd(x_data, U_out, VH_out, S_out, rows, cols, batches, full); } diff --git a/paddle/fluid/operators/take_along_axis_op.cc b/paddle/fluid/operators/take_along_axis_op.cc index 664f1031915e4661769d9b2844c5388f0efa91c0..fa8a5e92712ec86a01ca01b7eb644e289c03000a 100644 --- a/paddle/fluid/operators/take_along_axis_op.cc +++ b/paddle/fluid/operators/take_along_axis_op.cc @@ -12,10 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/take_along_axis_op.h" #include #include #include + +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/phi/core/ddim.h" @@ -139,16 +140,3 @@ REGISTER_OPERATOR(take_along_axis, ops::TakeAlongAxisOp, ops::TakeAlongAxisGradOpMaker); REGISTER_OPERATOR(take_along_axis_grad, ops::TakeAlongAxisGradOp); - -REGISTER_OP_CPU_KERNEL(take_along_axis, ops::TakeAlongAxisOpKernel, - ops::TakeAlongAxisOpKernel, - ops::TakeAlongAxisOpKernel, - ops::TakeAlongAxisOpKernel, - ops::TakeAlongAxisOpKernel); - -REGISTER_OP_CPU_KERNEL(take_along_axis_grad, - ops::TakeAlongAxisGradOpKernel, - ops::TakeAlongAxisGradOpKernel, - ops::TakeAlongAxisGradOpKernel, - ops::TakeAlongAxisGradOpKernel, - ops::TakeAlongAxisGradOpKernel); diff --git a/paddle/fluid/operators/take_along_axis_op.cu b/paddle/fluid/operators/take_along_axis_op.cu deleted file mode 100644 index b6c62d497b379dda568f661b31366914e6870a7c..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/take_along_axis_op.cu +++ /dev/null @@ -1,97 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
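The deleted take_along_axis kernels that follow are built on a gather/scatter pair: the forward gathers input values along `Axis` at the positions given by `Index`, and, as the deleted comment puts it, "the gradient of gather is scatter", i.e. the backward scatter-adds the upstream gradient into a zero-initialized tensor of the input's shape. A minimal 1-D sketch of that duality (illustrative only, not the phi kernels):

#include <cstddef>
#include <cstdint>
#include <vector>

// Forward: result[i] = input[index[i]].
std::vector<float> TakeAlong1D(const std::vector<float>& input,
                               const std::vector<int64_t>& index) {
  std::vector<float> result(index.size());
  for (std::size_t i = 0; i < index.size(); ++i) result[i] = input[index[i]];
  return result;
}

// Backward: start from a zero tensor shaped like the input and scatter-add
// the result gradient back to the gathered positions.
std::vector<float> TakeAlong1DGrad(const std::vector<float>& result_grad,
                                   const std::vector<int64_t>& index,
                                   std::size_t input_size) {
  std::vector<float> input_grad(input_size, 0.0f);
  for (std::size_t i = 0; i < index.size(); ++i)
    input_grad[index[i]] += result_grad[i];
  return input_grad;
}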
*/ - -#include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/fluid/operators/take_along_axis_op.h" -#include "paddle/phi/core/ddim.h" - -namespace paddle { -namespace operators { - -template -class TakeAlongAxisCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet( - "This kernel only runs on GPU device.")); - auto input = ctx.Input("Input"); - auto axis = ctx.Attr("Axis"); - auto index = ctx.Input("Index"); - auto result = ctx.Output("Result"); - result->Resize(index->dims()); - result->mutable_data(ctx.GetPlace()); - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - if (index_type == framework::proto::VarType::INT32) { - gpu_gather_kernel(*input, axis, *index, *result, - ctx.device_context()); - } else if (index_type == framework::proto::VarType::INT64) { - gpu_gather_kernel(*input, axis, *index, *result, - ctx.device_context()); - } - } -}; - -template -class TakeAlongAxisGradOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet("This kernel only runs on GPU.")); - - auto input_grad = ctx.Output(framework::GradVarName("Input")); - auto index = ctx.Input("Index"); - auto result_grad = ctx.Input(framework::GradVarName("Result")); - auto axis = ctx.Attr("Axis"); - // We need to know the shape of input matrix to determine the shape of grad - // matrix of input. - auto input = ctx.Input("Input"); - input_grad->Resize(input->dims()); - input_grad->mutable_data(ctx.GetPlace()); - - // Set to zero tensor. - auto &dev_ctx = ctx.template device_context(); - phi::funcs::SetConstant functor; - functor(reinterpret_cast(dev_ctx), - input_grad, static_cast(0)); - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - - if (index_type == framework::proto::VarType::INT32) { - gpu_scatter_add_kernel( - *input_grad, axis, *index, *result_grad, - ctx.device_context()); // the gradient of gather is scatter - } else if (index_type == framework::proto::VarType::INT64) { - gpu_scatter_add_kernel(*input_grad, axis, *index, - *result_grad, ctx.device_context()); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL(take_along_axis, ops::TakeAlongAxisCUDAKernel, - ops::TakeAlongAxisCUDAKernel, - ops::TakeAlongAxisCUDAKernel, - ops::TakeAlongAxisCUDAKernel, - ops::TakeAlongAxisCUDAKernel); -REGISTER_OP_CUDA_KERNEL(take_along_axis_grad, - ops::TakeAlongAxisGradOpCUDAKernel, - ops::TakeAlongAxisGradOpCUDAKernel, - ops::TakeAlongAxisGradOpCUDAKernel, - ops::TakeAlongAxisGradOpCUDAKernel, - ops::TakeAlongAxisGradOpCUDAKernel); diff --git a/paddle/fluid/operators/take_along_axis_op.h b/paddle/fluid/operators/take_along_axis_op.h deleted file mode 100644 index fc781dbddf2ad25de3728e76d231d0164d46c08e..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/take_along_axis_op.h +++ /dev/null @@ -1,92 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/gather_scatter_kernel.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class TakeAlongAxisOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_cpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet("This kernel only runs on CPU.")); - - auto input = ctx.Input("Input"); - auto axis = ctx.Attr("Axis"); - auto index = ctx.Input("Index"); - auto result = ctx.Output("Result"); - result->Resize(index->dims()); - result->mutable_data(ctx.GetPlace()); - - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - if (index_type == framework::proto::VarType::INT32) { - cpu_gather_kernel(*input, axis, *index, *result, - ctx.device_context()); - } else if (index_type == framework::proto::VarType::INT64) { - cpu_gather_kernel(*input, axis, *index, *result, - ctx.device_context()); - } - } -}; - -template -class TakeAlongAxisGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_cpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet("This kernel only runs on CPU.")); - - auto input_grad = ctx.Output(framework::GradVarName("Input")); - auto index = ctx.Input("Index"); - auto result_grad = ctx.Input(framework::GradVarName("Result")); - auto axis = ctx.Attr("Axis"); - // We need to know the shape of input matrix to determine the shape of grad - // matrix of input. - auto input = ctx.Input("Input"); - input_grad->Resize(input->dims()); - input_grad->mutable_data(ctx.GetPlace()); - - // Set to zero tensor. 
- auto &dev_ctx = ctx.template device_context(); - phi::funcs::SetConstant functor; - functor(reinterpret_cast(dev_ctx), - input_grad, static_cast(0)); - - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - if (index_type == framework::proto::VarType::INT32) { - cpu_scatter_add_kernel( - *input_grad, axis, *index, *result_grad, - ctx.device_context()); // the gradient of gather is scatter - } else if (index_type == framework::proto::VarType::INT64) { - cpu_scatter_add_kernel(*input_grad, axis, *index, - *result_grad, ctx.device_context()); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h index e05b4de65214c8cf55d099fccc7c18370b2312b7..0a71875d8931ef80846aa7e0c95ce1beab86fd7c 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h @@ -79,6 +79,28 @@ static void RuntimeStaticShapeCheck(std::vector runtime_input_shape, model_input_shape_str, runtime_input_shape_str)); } +static paddle::experimental::DataType TRT2FluidDataType( + nvinfer1::DataType type) { + switch (type) { + case nvinfer1::DataType::kFLOAT: + return paddle::experimental::DataType::FLOAT32; + case nvinfer1::DataType::kINT32: + return paddle::experimental::DataType::INT32; + case nvinfer1::DataType::kHALF: + return paddle::experimental::DataType::FLOAT16; + case nvinfer1::DataType::kINT8: + return paddle::experimental::DataType::INT8; +#if IS_TRT_VERSION_GE(7000) + case nvinfer1::DataType::kBOOL: + return paddle::experimental::DataType::BOOL; +#endif + default: + PADDLE_THROW(platform::errors::InvalidArgument( + "unknown fluid datatype in Fluid op converter")); + return paddle::experimental::DataType::FLOAT32; + } +} + static void RuntimeDynamicShapeCheck( const std::string &x, const std::vector &runtime_input_shape, const std::vector &min_input_shape, @@ -520,9 +542,12 @@ class TensorRTEngineOp : public framework::OperatorBase { buffers[bind_index] = static_cast(t.data()); } else if (type == framework::proto::VarType::INT32) { buffers[bind_index] = static_cast(t.data()); + } else if (type == framework::proto::VarType::FP16) { + buffers[bind_index] = static_cast(t.data()); } else { - PADDLE_THROW(platform::errors::Fatal( - "The TRT Engine OP only support float/int32_t/int64_t input.")); + PADDLE_THROW( + platform::errors::Fatal("The TRT Engine OP only support " + "float/int32_t/int64_t/float16 input.")); } } @@ -570,9 +595,10 @@ class TensorRTEngineOp : public framework::OperatorBase { "than the number of bindings, but got binding " "index = %d, number of bindings = %d.", bind_index, num_bindings)); - buffers[bind_index] = - static_cast(fluid_t->mutable_data(dev_place)); - + auto trt_type = engine->engine()->getBindingDataType(bind_index); + // get adr and set type + buffers[bind_index] = static_cast( + fluid_t->mutable_data(dev_place, TRT2FluidDataType(trt_type))); output_index += 1; } diff --git a/paddle/fluid/operators/test_common_infer_shape_functions.cc b/paddle/fluid/operators/test_common_infer_shape_functions.cc index a7c7e33f58af6ce8f59a301d1fc5ccdf511b608f..1de1b590a1311b81f16ba05e746402e1fc14c556 100644 --- a/paddle/fluid/operators/test_common_infer_shape_functions.cc +++ b/paddle/fluid/operators/test_common_infer_shape_functions.cc @@ -20,7 +20,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/common_infer_shape_functions.h" #include "paddle/phi/core/ddim.h" -USE_OP(relu); +USE_OP_ITSELF(relu); USE_OP_ITSELF(elementwise_add); USE_OP_ITSELF(softmax); diff --git a/paddle/fluid/operators/tile_op.cc b/paddle/fluid/operators/tile_op.cc index dc12f8e8892a022c6f55f4fe3a6237a7a01fa290..e179149c5bb77bd642f744be48109a941c66febf 100644 --- a/paddle/fluid/operators/tile_op.cc +++ b/paddle/fluid/operators/tile_op.cc @@ -12,11 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/tile_op.h" #include #include #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + namespace paddle { namespace operators { @@ -26,66 +30,6 @@ class TileOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - protected: - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Tile"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Tile"); - auto x_dims = ctx->GetInputDim("X"); - auto repeat_times = ctx->Attrs().Get>("repeat_times"); - if (repeat_times.size() == 0) { - repeat_times = std::vector(x_dims.size(), -1); - } - - PADDLE_ENFORCE_LE( - x_dims.size(), MAX_RANK_SUPPORTED, - platform::errors::InvalidArgument( - "The rank of the input 'x' for tile op " - "must not be greater than %d, but the value received is %d.", - MAX_RANK_SUPPORTED, x_dims.size())); - PADDLE_ENFORCE_LE( - repeat_times.size(), MAX_RANK_SUPPORTED, - platform::errors::InvalidArgument( - "The size of the shape of input 'repeat_times' for tile op " - "must not be greater than %d, but the value received is %d.", - MAX_RANK_SUPPORTED, repeat_times.size())); - PADDLE_ENFORCE_GE( - repeat_times.size(), 1, - platform::errors::InvalidArgument( - "The size of the shape of input 'repeat_times' for tile op " - "must be positive integers, but the value received is %d.", - repeat_times.size())); - - auto out_rank = - std::max(static_cast(x_dims.size()), repeat_times.size()); - std::vector out_shape(out_rank); - auto x_dim_vec = phi::vectorize(x_dims); - if (x_dim_vec.size() > repeat_times.size()) { - auto diff = x_dim_vec.size() - repeat_times.size(); - repeat_times.insert(repeat_times.begin(), diff, -1); - } else { - auto diff = repeat_times.size() - x_dim_vec.size(); - x_dim_vec.insert(x_dim_vec.begin(), diff, -1); - } - for (size_t i = 0; i < repeat_times.size(); ++i) { - if (x_dim_vec[i] == -1 || repeat_times[i] == -1) { - out_shape[i] = -1; - } else { - PADDLE_ENFORCE_GT( - repeat_times[i], 0, - platform::errors::InvalidArgument( - "Every element of the input 'repeat_times' for tile op must be " - "greater than 0, but the value given is %d.", - repeat_times[i])); - out_shape[i] = x_dim_vec[i] * repeat_times[i]; - } - } - - ctx->SetOutputDim("Out", phi::make_ddim(out_shape)); - if (out_shape[0] == x_dims[0]) { - ctx->ShareLoD("X", "Out"); - } - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -268,38 +212,15 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(TileGradNoNeedBufVarsInferer, "X"); } // namespace paddle namespace ops = paddle::operators; + +DECLARE_INFER_SHAPE_FUNCTOR(tile, TileInferMetaFunctor, + 
PD_INFER_META(phi::TileInferMeta)); + REGISTER_OPERATOR(tile, ops::TileOp, ops::TileOpMaker, ops::TileGradOpMaker, - ops::TileGradOpMaker); + ops::TileGradOpMaker, + TileInferMetaFunctor); REGISTER_OPERATOR(tile_grad, ops::TileGradOp, ops::TileDoubleGradOpMaker, ops::TileDoubleGradOpMaker, ops::TileGradNoNeedBufVarsInferer); -REGISTER_OP_CPU_KERNEL( - tile, ops::TileKernel, - ops::TileKernel, - ops::TileKernel, - ops::TileKernel, - ops::TileKernel); -REGISTER_OP_CPU_KERNEL( - tile_grad, ops::TileGradKernel, - ops::TileGradKernel, - ops::TileGradKernel, - ops::TileGradKernel); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -REGISTER_OP_CUDA_KERNEL( - tile, ops::TileKernel, - ops::TileKernel, - ops::TileKernel, - ops::TileKernel, - ops::TileKernel, - ops::TileKernel); -REGISTER_OP_CUDA_KERNEL( - tile_grad, ops::TileGradKernel, - ops::TileGradKernel, - ops::TileGradKernel, - ops::TileGradKernel, - ops::TileGradKernel); -#endif diff --git a/paddle/fluid/operators/tile_op.h b/paddle/fluid/operators/tile_op.h deleted file mode 100644 index 1698b5e3c6322e2cd9cbe7cf4839e2fc08627b32..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/tile_op.h +++ /dev/null @@ -1,306 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
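The tile shape rule that the deleted InferShape above encoded, and that phi::TileInferMeta now owns, promotes the shorter of x's shape and repeat_times by left-padding it and then multiplies element-wise (at runtime the deleted kernel pads with 1; the static InferShape marked padded entries as unknown, -1). A small worked restatement of the runtime rule (illustrative helper, not Paddle code):

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

// Example: x_dims = {3, 4}, repeat_times = {2, 1, 5}
//   -> x promoted to {1, 3, 4} -> out = {2, 3, 20}.
std::vector<int64_t> TileOutShape(std::vector<int64_t> x_dims,
                                  std::vector<int64_t> repeat_times) {
  const std::size_t rank = std::max(x_dims.size(), repeat_times.size());
  x_dims.insert(x_dims.begin(), rank - x_dims.size(), 1);
  repeat_times.insert(repeat_times.begin(), rank - repeat_times.size(), 1);
  std::vector<int64_t> out(rank);
  for (std::size_t i = 0; i < rank; ++i) out[i] = x_dims[i] * repeat_times[i];
  return out;
}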
*/ - -#pragma once - -#include -#include - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/eigen/eigen_function.h" - -#define MAX_RANK_SUPPORTED 6 - -namespace paddle { -namespace operators { -inline std::vector get_repeat_times( - const framework::ExecutionContext& ctx) { - if (ctx.HasInput("RepeatTimes")) { - auto* repeat_tensor = ctx.Input("RepeatTimes"); - auto* repeat_data = repeat_tensor->data(); - framework::Tensor cpu_repeat_tensor; - if (platform::is_gpu_place(repeat_tensor->place()) || - platform::is_xpu_place(repeat_tensor->place()) || - platform::is_npu_place(repeat_tensor->place())) { - paddle::framework::TensorCopySync(*repeat_tensor, platform::CPUPlace(), - &cpu_repeat_tensor); - repeat_data = cpu_repeat_tensor.data(); - } - auto vec_repeat_times = - std::vector(repeat_data, repeat_data + repeat_tensor->numel()); - return vec_repeat_times; - } - - auto list_repeat_times_tensor = - ctx.MultiInput("repeat_times_tensor"); - if (list_repeat_times_tensor.size() > 0) { - // get tensor from - std::vector vec_repeat_times; - for (size_t i = 0; i < list_repeat_times_tensor.size(); ++i) { - auto tensor = list_repeat_times_tensor[i]; - if (platform::is_gpu_place(tensor->place()) || - platform::is_xpu_place(tensor->place()) || - platform::is_npu_place(tensor->place())) { - framework::Tensor temp; - paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp); - vec_repeat_times.push_back(*temp.data()); - } else { - vec_repeat_times.push_back(*tensor->data()); - } - } - return vec_repeat_times; - } else { - return ctx.Attr>("repeat_times"); - } -} - -using Tensor = framework::Tensor; -template -using EigenVector = framework::EigenVector; -template -using EigenTensor = framework::EigenTensor; -using framework::To32BitIndex; - -template -class TileKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto rank = context.Input("X")->dims().size(); - PADDLE_ENFORCE_GE( - rank, 1, platform::errors::InvalidArgument( - "The rank of the input 'x' for tile op must be a positive " - "integer, but the value received is %d.", - rank)); - PADDLE_ENFORCE_LE( - rank, MAX_RANK_SUPPORTED, - platform::errors::InvalidArgument( - "The rank of the input 'x' for tile op " - "must be less than or equal to %d, but the value received is %d.", - MAX_RANK_SUPPORTED, rank)); - auto repeat_times = get_repeat_times(context); - int repeat_times_size = repeat_times.size(); - PADDLE_ENFORCE_GE( - repeat_times_size, 1, - platform::errors::InvalidArgument( - "The number of elements of the input 'repeat_times' for tile " - "op must be positive, but the value received is %d.", - repeat_times_size)); - PADDLE_ENFORCE_LE( - repeat_times_size, MAX_RANK_SUPPORTED, - platform::errors::InvalidArgument( - "The number of elements of the input 'repeat_times' for tile op " - "must be less than or equal to %d, but the value received is %d.", - MAX_RANK_SUPPORTED, repeat_times_size)); - rank = std::max(rank, repeat_times_size); - switch (rank) { - case 1: - Tile<1>(context); - break; - case 2: - Tile<2>(context); - break; - case 3: - Tile<3>(context); - break; - case 4: - Tile<4>(context); - break; - case 5: - Tile<5>(context); - break; - case 6: - Tile<6>(context); - break; - } - } - - protected: - template - void Tile(const framework::ExecutionContext& context) const { - auto* in0 = context.Input("X"); - - auto in_dims = 
in0->dims(); - auto repeat_times = get_repeat_times(context); - for (size_t i = 0; i < repeat_times.size(); ++i) { - PADDLE_ENFORCE_GT( - repeat_times[i], 0, - platform::errors::InvalidArgument( - "All elements of the input 'repeat_times' for tile op must " - "be positive integers, but the value received is %d.", - repeat_times[i])); - } - auto vec_in_dims = phi::vectorize(in_dims); - if (repeat_times.size() < vec_in_dims.size()) { - int diff = vec_in_dims.size() - repeat_times.size(); - repeat_times.insert(repeat_times.begin(), diff, 1); - } else { - int diff = repeat_times.size() - vec_in_dims.size(); - vec_in_dims.insert(vec_in_dims.begin(), diff, 1); - } - PADDLE_ENFORCE_EQ( - repeat_times.size(), vec_in_dims.size(), - platform::errors::InvalidArgument( - "The rank (%d) of the input 'x' and the rank (%d) of the input " - "'repeat_times' for tile op must match after promotion.", - vec_in_dims.size(), repeat_times.size())); - auto* out0 = context.Output("Out"); - Eigen::DSizes bcast_dims; - for (size_t i = 0; i < repeat_times.size(); ++i) { - bcast_dims[i] = repeat_times[i]; - } - - framework::DDim new_in_dims = phi::make_ddim(vec_in_dims); - framework::DDim out_dims(new_in_dims); - for (size_t i = 0; i < repeat_times.size(); ++i) { - out_dims[i] *= repeat_times[i]; - } - - out0->Resize(out_dims); - auto x = EigenTensor::From(*in0, new_in_dims); - out0->mutable_data(context.GetPlace()); - auto y = EigenTensor::From(*out0, out_dims); - auto& place = - *context.template device_context().eigen_device(); - // use 32-bit index to speed up - bool use_32bit_index = y.size() < Eigen::NumTraits::highest(); - if (use_32bit_index) { - EigenBroadcast, T, Rank>::Eval( - place, To32BitIndex(y), To32BitIndex(x), bcast_dims); - } else { - EigenBroadcast, T, Rank>::Eval(place, y, x, - bcast_dims); - } - } -}; - -template -class TileGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto repeat_times = get_repeat_times(context); - auto x_dims = x->dims(); - auto vec_in_dims = phi::vectorize(x_dims); - if (repeat_times.size() < vec_in_dims.size()) { - int diff = vec_in_dims.size() - repeat_times.size(); - repeat_times.insert(repeat_times.begin(), diff, 1); - } else { - int diff = repeat_times.size() - vec_in_dims.size(); - vec_in_dims.insert(vec_in_dims.begin(), diff, 1); - } - // 1. reshape_dims_vec is the broadcast parameter. - // 2. reduce_dims_vec is the dimension parameter to compute gradients. For - // each dimension expanded, the gradients should be summed to original - // size. 
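// Illustrative worked example for the comment above (values are hypothetical):
// x.shape = [2, 3], repeat_times = [2, 1]  =>  Out.shape = [4, 3].
// The loop below then builds reshape_dims_vec = {2, 2, 1, 3} and
// reduce_dims_vec = {0, 2}: Out@GRAD ([4, 3]) is viewed as [2, 2, 1, 3] and
// summed over dims 0 and 2, which recovers X@GRAD with the original shape [2, 3].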
- std::vector reshape_dims_vec; - std::vector reduce_dims_vec; - for (size_t i = 0; i < repeat_times.size(); ++i) { - reduce_dims_vec.push_back(reshape_dims_vec.size()); - reshape_dims_vec.push_back(repeat_times[i]); - reshape_dims_vec.push_back(vec_in_dims[i]); - } - - int dims = reduce_dims_vec.size(); - - bool just_copy = true; - for (size_t i = 0; i < repeat_times.size(); i++) { - if (repeat_times[i] != 1) { - just_copy = false; - break; - } - } - // no need reduce, just copy - if (just_copy) { - auto* dout = context.Input(framework::GradVarName("Out")); - auto* dx = context.Output(framework::GradVarName("X")); - dx->mutable_data(context.GetPlace()); - framework::TensorCopy(*dout, context.GetPlace(), context.device_context(), - dx); - // TensorCopy may change the dims of dx - dx->Resize(x_dims); - } else { - PADDLE_ENFORCE_GE(dims, 1, - platform::errors::InvalidArgument( - "Th rank of the input 'Out@GRAD' for tile_grad op " - " must be greater than or equal to 1, but " - "the value received is %d.", - dims)); - PADDLE_ENFORCE_LE(dims, MAX_RANK_SUPPORTED, - platform::errors::InvalidArgument( - "The rank of the input 'Out@GRAD' for tile_grad op " - "must be less than or equal " - "to %d, but the value received is %d.", - MAX_RANK_SUPPORTED, dims)); - switch (dims) { - case 1: - TileBackward<1>(context, reshape_dims_vec, reduce_dims_vec); - break; - case 2: - TileBackward<2>(context, reshape_dims_vec, reduce_dims_vec); - break; - case 3: - TileBackward<3>(context, reshape_dims_vec, reduce_dims_vec); - break; - case 4: - TileBackward<4>(context, reshape_dims_vec, reduce_dims_vec); - break; - case 5: - TileBackward<5>(context, reshape_dims_vec, reduce_dims_vec); - break; - case 6: - TileBackward<6>(context, reshape_dims_vec, reduce_dims_vec); - break; - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "Only support tensor with rank being between 1 and 6. But " - "received tensor's rank = %d.", - dims)); - } - } - } - - protected: - template - void TileBackward(const framework::ExecutionContext& context, - const std::vector& reshape_dims_vec, - const std::vector& reduce_dims_vec) const { - size_t reshape_size = reshape_dims_vec.size(); - size_t reduce_size = reduce_dims_vec.size(); - auto* in0 = context.Input(framework::GradVarName("Out")); - auto* out0 = context.Output(framework::GradVarName("X")); - out0->mutable_data(context.GetPlace()); - auto x_grad = EigenVector::Flatten(*out0); - Eigen::DSizes reshape_dims; - for (size_t i = 0; i < reshape_size; ++i) { - reshape_dims[i] = reshape_dims_vec[i]; - } - Eigen::DSizes reduce_dims; - for (size_t i = 0; i < reduce_size; ++i) { - reduce_dims[i] = reduce_dims_vec[i]; - } - - auto out_grad = EigenVector::Flatten(*in0); - auto& place = - *context.template device_context().eigen_device(); - EigenBroadcastGrad, T, Dims>::Eval( - place, x_grad, out_grad, reduce_dims, reshape_dims); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/tile_op_functor.h b/paddle/fluid/operators/tile_op_functor.h new file mode 100644 index 0000000000000000000000000000000000000000..95bfb9f4e1a9d374c66997567f5d80df8b5d8701 --- /dev/null +++ b/paddle/fluid/operators/tile_op_functor.h @@ -0,0 +1,67 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include + +#include "paddle/fluid/framework/operator.h" + +#define MAX_RANK_SUPPORTED 6 + +namespace paddle { +namespace operators { + +inline std::vector get_repeat_times( + const framework::ExecutionContext& ctx) { + if (ctx.HasInput("RepeatTimes")) { + auto* repeat_tensor = ctx.Input("RepeatTimes"); + auto* repeat_data = repeat_tensor->data(); + framework::Tensor cpu_repeat_tensor; + if (platform::is_gpu_place(repeat_tensor->place()) || + platform::is_xpu_place(repeat_tensor->place()) || + platform::is_npu_place(repeat_tensor->place())) { + paddle::framework::TensorCopySync(*repeat_tensor, platform::CPUPlace(), + &cpu_repeat_tensor); + repeat_data = cpu_repeat_tensor.data(); + } + auto vec_repeat_times = + std::vector(repeat_data, repeat_data + repeat_tensor->numel()); + return vec_repeat_times; + } + + auto list_repeat_times_tensor = + ctx.MultiInput("repeat_times_tensor"); + if (list_repeat_times_tensor.size() > 0) { + // get tensor from + std::vector vec_repeat_times; + for (size_t i = 0; i < list_repeat_times_tensor.size(); ++i) { + auto tensor = list_repeat_times_tensor[i]; + if (platform::is_gpu_place(tensor->place()) || + platform::is_xpu_place(tensor->place()) || + platform::is_npu_place(tensor->place())) { + framework::Tensor temp; + paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp); + vec_repeat_times.push_back(*temp.data()); + } else { + vec_repeat_times.push_back(*tensor->data()); + } + } + return vec_repeat_times; + } else { + return ctx.Attr>("repeat_times"); + } +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/tile_op_npu.cc b/paddle/fluid/operators/tile_op_npu.cc index 9e306c7be537bc7403812f4907541e1a9671c12a..cea6b458aec782923722cb37fe41c1c4d59292e5 100644 --- a/paddle/fluid/operators/tile_op_npu.cc +++ b/paddle/fluid/operators/tile_op_npu.cc @@ -11,7 +11,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/tile_op.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/tile_op_functor.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/tile_op_xpu.cc b/paddle/fluid/operators/tile_op_xpu.cc index 6b60b167a2465fcb03d8ec088cfa288f9fb14af1..598377587d6f73e0c21abbc4d3819d16eacb1f23 100644 --- a/paddle/fluid/operators/tile_op_xpu.cc +++ b/paddle/fluid/operators/tile_op_xpu.cc @@ -11,11 +11,14 @@ limitations under the License. 
*/ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/tile_op.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/tile_op_functor.h" namespace paddle { namespace operators { +using Tensor = framework::Tensor; + template class TileXPUKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/top_k_function_cuda.h b/paddle/fluid/operators/top_k_function_cuda.h index d60976928e00cb5ecfde6ca65e0a1b0d5b1ef938..80c9935057cb5d5809fde545bdd0772afdaf2702 100644 --- a/paddle/fluid/operators/top_k_function_cuda.h +++ b/paddle/fluid/operators/top_k_function_cuda.h @@ -51,6 +51,19 @@ namespace operators { using Tensor = framework::Tensor; +inline void GetDims(const phi::DDim& dim, int axis, int* pre, int* n, + int* post) { + *pre = 1; + *post = 1; + *n = dim[axis]; + for (int i = 0; i < axis; ++i) { + (*pre) *= dim[i]; + } + for (int i = axis + 1; i < dim.size(); ++i) { + (*post) *= dim[i]; + } +} + struct SegmentOffsetIter { EIGEN_DEVICE_FUNC explicit SegmentOffsetIter(int num_cols) : num_cols_(num_cols) {} diff --git a/paddle/fluid/operators/top_k_v2_op.cc b/paddle/fluid/operators/top_k_v2_op.cc index 810afc901df57bfa3c518b2363fb9153ee353762..d1add111e1d24cb711955a9aff06eb19feb35dc9 100644 --- a/paddle/fluid/operators/top_k_v2_op.cc +++ b/paddle/fluid/operators/top_k_v2_op.cc @@ -12,9 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/top_k_v2_op.h" #include +#include "paddle/fluid/framework/op_registry.h" + namespace paddle { namespace operators { @@ -173,15 +174,3 @@ REGISTER_OPERATOR(top_k_v2, ops::TopkV2Op, ops::TopkV2OpMaker, ops::TopkV2GradOpMaker); REGISTER_OPERATOR(top_k_v2_grad, ops::TopkV2OpGrad); - -REGISTER_OP_CPU_KERNEL(top_k_v2, - ops::TopkV2Kernel, - ops::TopkV2Kernel, - ops::TopkV2Kernel, - ops::TopkV2Kernel) - -REGISTER_OP_CPU_KERNEL( - top_k_v2_grad, ops::TopkV2GradKernel, - ops::TopkV2GradKernel, - ops::TopkV2GradKernel, - ops::TopkV2GradKernel) diff --git a/paddle/fluid/operators/top_k_v2_op.cu b/paddle/fluid/operators/top_k_v2_op.cu deleted file mode 100644 index 84d8eef53bf72c5dbd5404a889925541374c9823..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/top_k_v2_op.cu +++ /dev/null @@ -1,296 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/top_k_function_cuda.h" -#include "paddle/fluid/operators/top_k_v2_op.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -#define FIXED_BLOCK_DIM_BASE(dim, ...) \ - case (dim): { \ - constexpr auto kBlockDim = (dim); \ - __VA_ARGS__; \ - } break - -#define FIXED_BLOCK_DIM(...) 
\ - FIXED_BLOCK_DIM_BASE(256, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_BASE(128, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_BASE(64, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_BASE(32, ##__VA_ARGS__) - -template -class TopkV2OpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::InvalidArgument( - "It must use CUDAPlace, you must check your device set.")); - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); - auto* indices = ctx.Output("Indices"); - - // get the attributes - int k = static_cast(ctx.Attr("k")); - int axis = static_cast(ctx.Attr("axis")); - const bool& sorted = static_cast(ctx.Attr("sorted")); - const bool& largest = static_cast(ctx.Attr("largest")); - - // get the input dims - const auto& in_dims = input->dims(); - // calcluate the real axis - if (axis < 0) axis += in_dims.size(); - - auto* k_t = ctx.Input("K"); - if (k_t) { - Tensor k_host; - framework::TensorCopySync(*k_t, platform::CPUPlace(), &k_host); - k = k_host.data()[0]; - framework::DDim output_dims = output->dims(); - output_dims[axis] = k; - output->Resize(output_dims); - indices->Resize(output_dims); - } - - const auto& out_dims = output->dims(); - - const T* input_data = input->data(); - T* output_data = output->mutable_data(ctx.GetPlace()); - int64_t* indices_data = indices->mutable_data(ctx.GetPlace()); - - if (axis == in_dims.size() - 1) { - // if get the topK from the last axis - const int64_t& input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t& input_width = in_dims[in_dims.size() - 1]; - const auto& dev_ctx = ctx.cuda_device_context(); - - if (k > input_width) k = input_width; - - // The conclusion is drawn from the data through multiple sets of - // statistics - if (input_width >= 128 && k >= input_width * 0.75) { - if (SortTopk(dev_ctx, input, input_width, input_height, k, output, - indices, largest)) { - // Successed, return. - return; - } else { - LOG(INFO) << "TopKOP: Some errors happened when use cub sorting, use " - "default topk kernel."; - } - } - - // NOTE: pass lds and dim same to input width. - // NOTE: old matrix implementation of stride is different to eigen. - const int kMaxHeight = 2048; - int gridx = input_height < kMaxHeight ? 
input_height : kMaxHeight; - switch (GetDesiredBlockDim(input_width)) { -#ifdef PADDLE_WITH_HIP - FIXED_BLOCK_DIM( - KeMatrixTopK<<>>( - output_data, k, indices_data, input_data, input_width, - input_width, static_cast(k), gridx, input_height, - largest)); -#else - FIXED_BLOCK_DIM( - KeMatrixTopK<<>>( - output_data, k, indices_data, input_data, input_width, - input_width, static_cast(k), gridx, input_height, - largest)); -#endif - default: - PADDLE_THROW(platform::errors::Fatal( - "the input data shape has error in the topk cuda kernel.")); - } - } else { - // if get topK not from the last axis, will tranpose the tensor and get - // TopK - - // first step, prepare the trans args for the tranpose - std::vector trans; - for (int i = 0; i < axis; i++) { - trans.emplace_back(i); - } - trans.emplace_back(in_dims.size() - 1); - for (int i = axis + 1; i < in_dims.size() - 1; i++) { - trans.emplace_back(i); - } - trans.emplace_back(axis); - - framework::DDim trans_dims(in_dims); - framework::DDim trans_out_dims(output->dims()); - for (int i = 0; i < trans.size(); i++) { - trans_dims[i] = in_dims[trans[i]]; - trans_out_dims[i] = out_dims[trans[i]]; - } - // second step, tranpose the input - Tensor trans_input; - trans_input.mutable_data(trans_dims, ctx.GetPlace()); - int ndims = trans.size(); - const auto& dev_ctx = ctx.cuda_device_context(); - TransCompute(ndims, dev_ctx, *input, - &trans_input, trans); - // third step, calcluate the topk - // allocate the tmp cuda memory for the tmp result - Tensor trans_ind; - trans_ind.mutable_data(trans_out_dims, ctx.GetPlace()); - Tensor trans_out; - trans_out.mutable_data(trans_out_dims, ctx.GetPlace()); - - const int64_t input_height = - phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); - const int64_t input_width = trans_dims[trans_dims.size() - 1]; - - if (k > input_width) k = input_width; - - // The conclusion is drawn from the data through multiple sets of - // statistics - if (input_width >= 128 && k >= input_width * 0.75) { - if (SortTopk(dev_ctx, &trans_input, input_width, input_height, k, - &trans_out, &trans_ind, largest)) { - // last step, tranpose back the indices and output - TransCompute( - ndims, dev_ctx, trans_ind, indices, trans); - TransCompute( - ndims, dev_ctx, trans_out, output, trans); - return; - } else { - LOG(INFO) << "TopKOP: Some errors happened when use cub sorting, use " - "default topk kernel."; - } - } - - const int kMaxHeight = 2048; - int gridx = input_height < kMaxHeight ? 
input_height : kMaxHeight; - switch (GetDesiredBlockDim(input_width)) { -#ifdef PADDLE_WITH_HIP - FIXED_BLOCK_DIM( - KeMatrixTopK<<>>( - trans_out.data(), k, trans_ind.data(), - trans_input.data(), input_width, input_width, - static_cast(k), gridx, input_height, largest)); -#else - FIXED_BLOCK_DIM( - KeMatrixTopK<<>>( - trans_out.data(), k, trans_ind.data(), - trans_input.data(), input_width, input_width, - static_cast(k), gridx, input_height, largest)); -#endif - default: - PADDLE_THROW(platform::errors::Fatal( - "the input data shape has error in the topk cuda kernel.")); - } - - // last step, tranpose back the indices and output - TransCompute( - ndims, dev_ctx, trans_ind, indices, trans); - TransCompute(ndims, dev_ctx, trans_out, - output, trans); - } - } -}; - -#undef FIXED_BLOCK_DIM_BASE -#undef FIXED_BLOCK_DIM -template -class TopkV2OpGradCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(context.GetPlace()), true, - platform::errors::InvalidArgument( - "It must use CUDAPlace, you must check your device set.")); - auto* x = context.Input("X"); - auto* out_grad = context.Input(framework::GradVarName("Out")); - auto* indices = context.Input("Indices"); - auto* x_grad = context.Output(framework::GradVarName("X")); - int axis = context.Attr("axis"); - - const auto& in_dims = x->dims(); - const auto& out_dims = indices->dims(); - - // get the real the axis and the k - if (axis < 0) axis += in_dims.size(); - const int& k = out_dims[axis]; - const int& raw_height = in_dims[axis]; - - // allocate the cuda memory for the x_grad - T* x_grad_data = x_grad->mutable_data(context.GetPlace()); - const T* out_grad_data = out_grad->data(); - const int64_t* indices_data = indices->data(); - - int pre, n, post; - GetDims(in_dims, axis, &pre, &n, &post); - - // calcluate the block and grid num - auto& dev_ctx = context.cuda_device_context(); - auto ComputeBlockSize = [](int col) { - if (col > 512) - return 1024; - else if (col > 256 && col <= 512) - return 512; - else if (col > 128 && col <= 256) - return 256; - else if (col > 64 && col <= 128) - return 128; - else - return 64; - }; - int block_size = ComputeBlockSize(post * k); - int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); - const int max_blocks = std::max(((max_threads - 1) / block_size + 1), 1); - int grid_size = std::min(max_blocks, pre); - - // lanuch the cuda kernel to assign the grad - AssignGradWithAxis<<>>( - out_grad_data, indices_data, x_grad_data, pre, post, n, k); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OP_CUDA_KERNEL( - top_k_v2, - paddle::operators::TopkV2OpCUDAKernel, - paddle::operators::TopkV2OpCUDAKernel, - paddle::operators::TopkV2OpCUDAKernel, - paddle::operators::TopkV2OpCUDAKernel, - paddle::operators::TopkV2OpCUDAKernel); - -REGISTER_OP_CUDA_KERNEL( - top_k_v2_grad, paddle::operators::TopkV2OpGradCUDAKernel< - paddle::platform::CUDADeviceContext, float>, - paddle::operators::TopkV2OpGradCUDAKernel< - paddle::platform::CUDADeviceContext, double>, - paddle::operators::TopkV2OpGradCUDAKernel< - paddle::platform::CUDADeviceContext, int>, - paddle::operators::TopkV2OpGradCUDAKernel< - paddle::platform::CUDADeviceContext, int64_t>, - paddle::operators::TopkV2OpGradCUDAKernel< - paddle::platform::CUDADeviceContext, paddle::platform::float16>); diff --git a/paddle/fluid/operators/top_k_v2_op.h b/paddle/fluid/operators/top_k_v2_op.h deleted file mode 100644 index 
a808207476f3b9be2636741d7b0ac06002ccba08..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/top_k_v2_op.h +++ /dev/null @@ -1,335 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -/* - The reason why we need the topk v2 is because the compatibility. We redefine - the NaN is maximum value - in the process of comparing. If do not add the topk v2, will affect the - inference result of model that traing - by the older version paddlepaddle. -*/ - -#pragma once -#include -#include -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/top_k_op.h" -#include "paddle/fluid/operators/transpose_op.h" - -namespace paddle { -namespace operators { - -inline void GetDims(const framework::DDim& dim, int axis, int* pre, int* n, - int* post) { - *pre = 1; - *post = 1; - *n = dim[axis]; - for (int i = 0; i < axis; ++i) { - (*pre) *= dim[i]; - } - for (int i = axis + 1; i < dim.size(); ++i) { - (*post) *= dim[i]; - } -} - -template -static void FullTopK(Type input_height, Type input_width, int input_dim, - const framework::Tensor* input, T* t_out, Type* t_indices, - const int& k, const bool& largest, const bool& sorted) { - // when the k is small, will the partial sort - bool partial_sort_flag = (k * 64) < input_width; - -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - // Eigen::DSizes flat2dims(input_height, input_width); - for (Type i = 0; i < input_height; ++i) { - std::vector> col_vec; - col_vec.reserve(input_width); - if (input_dim == 1) { - auto e_input = framework::EigenVector::Flatten(*input); - for (Type j = 0; j < input_width; ++j) { - col_vec.emplace_back(std::pair(e_input(j), j)); - } - } else { - auto e_input = framework::EigenMatrix::Reshape(*input, input_dim - 1); - for (Type j = 0; j < input_width; ++j) { - col_vec.emplace_back(std::pair(e_input(i, j), j)); - } - } - if (partial_sort_flag) { - std::partial_sort( - col_vec.begin(), col_vec.begin() + k, col_vec.end(), - [&largest](const std::pair& l, const std::pair& r) { - if (largest) { - return (std::isnan(static_cast(l.first)) && - !std::isnan(static_cast(r.first))) || - (l.first > r.first); - } else { - return (!std::isnan(static_cast(l.first)) && - std::isnan(static_cast(r.first))) || - (l.first < r.first); - } - }); - } else { - // use the nth-element to get the K-larger or K-small element - if (largest) { - std::nth_element( - col_vec.begin(), col_vec.begin() + k - 1, col_vec.end(), - [](const std::pair& l, const std::pair& r) { - return (std::isnan(static_cast(l.first)) && - !std::isnan(static_cast(r.first))) || - (l.first > r.first); - }); - // the nth-element will get the unorder elements, sort the element - if (sorted) { - std::sort(col_vec.begin(), col_vec.begin() + k - 1, - [&largest](const std::pair& l, - const std::pair& r) { - return (std::isnan(static_cast(l.first)) && - !std::isnan(static_cast(r.first))) || - (l.first > r.first); - }); - } - } else { - 
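// A minimal standalone sketch (not part of the patch itself) of the
// (pre, n, post) factorization that the removed GetDims helper computes:
// the shape is flattened into pre * n * post around `axis`, so a kernel can
// treat the tensor as pre * post independent slices of length n.
// The helper name below is illustrative only.
#include <cstdint>
#include <vector>

static void SplitAroundAxis(const std::vector<int64_t>& dims, int axis,
                            int64_t* pre, int64_t* n, int64_t* post) {
  *pre = 1;
  *n = dims[axis];
  *post = 1;
  for (int i = 0; i < axis; ++i) *pre *= dims[i];
  for (int i = axis + 1; i < static_cast<int>(dims.size()); ++i) {
    *post *= dims[i];
  }
}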
std::nth_element( - col_vec.begin(), col_vec.begin() + k - 1, col_vec.end(), - [](const std::pair& l, const std::pair& r) { - return (!std::isnan(static_cast(l.first)) && - std::isnan(static_cast(r.first))) || - (l.first < r.first); - }); - // the nth-element will get the unorder elements, sort the element - if (sorted) { - std::sort( - col_vec.begin(), col_vec.begin() + k - 1, - [](const std::pair& l, const std::pair& r) { - return (!std::isnan(static_cast(l.first)) && - std::isnan(static_cast(r.first))) || - (l.first < r.first); - }); - } - } - } - for (Type j = 0; j < k; ++j) { - t_out[i * k + j] = col_vec[j].first; - t_indices[i * k + j] = col_vec[j].second; - } - } -} - -template -static void FullTopKAssign(const Type& input_height, const Type& input_width, - const int& input_dim, const framework::Tensor* input, - const framework::Tensor* indices, T* output_data, - const int& k) { -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (Type i = 0; i < input_height; ++i) { - if (input_dim == 1) { - auto e_input = framework::EigenVector::Flatten(*input); - auto e_indices = framework::EigenVector::Flatten(*indices); - for (Type j = 0; j < k; ++j) { - output_data[i * input_width + e_indices(j)] = e_input(j); - } - } else { - auto e_input = framework::EigenMatrix::Reshape(*input, input_dim - 1); - auto e_indices = - framework::EigenMatrix::Reshape(*indices, input_dim - 1); - for (Type j = 0; j < k; ++j) { - output_data[i * input_width + e_indices(i, j)] = e_input(i, j); - } - } - } -} - -template -class TopkV2Kernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - // Get the top k elements of each row of input tensor - auto* input = context.Input("X"); - auto* output = context.Output("Out"); - auto* indices = context.Output("Indices"); - const auto& in_dims = input->dims(); - int k = static_cast(context.Attr("k")); - const auto& sorted = static_cast(context.Attr("sorted")); - const auto& largest = static_cast(context.Attr("largest")); - - // axis < 0, cacluate the real axis - int axis = static_cast(context.Attr("axis")); - if (axis < 0) axis += in_dims.size(); - - // if K tensor is not null, will the use K tesnor as k - auto* k_t = context.Input("K"); - if (k_t) { - k = k_t->data()[0]; - framework::DDim output_dims = output->dims(); - // accroding to axis to set K value in the dim - output_dims[axis] = k; - output->Resize(output_dims); - indices->Resize(output_dims); - } - - T* output_data = output->mutable_data(context.GetPlace()); - int64_t* indices_data = indices->mutable_data(context.GetPlace()); - const auto& out_dims = output->dims(); - if (axis + 1 == in_dims.size()) { - const int64_t& input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t& input_width = in_dims[in_dims.size() - 1]; - FullTopK(input_height, input_width, in_dims.size(), input, - output_data, indices_data, k, largest, sorted); - } else { - // if the topk dims is not last dim, will tranpose and do topk - std::vector trans; - for (int i = 0; i < axis; i++) { - trans.emplace_back(i); - } - trans.push_back(in_dims.size() - 1); - for (int i = axis + 1; i < in_dims.size() - 1; i++) { - trans.emplace_back(i); - } - trans.emplace_back(axis); - - // get the trans input_dims, out_dims - framework::DDim trans_dims(in_dims); - framework::DDim trans_out_dims(output->dims()); - for (size_t i = 0; i < trans.size(); i++) { - trans_dims[i] = in_dims[trans[i]]; - } - for (size_t i = 0; i < trans.size(); i++) { - 
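// A minimal standalone sketch (not part of the patch) of the NaN handling the
// removed FullTopK comparators implement for top_k_v2 compatibility: NaN is
// treated as the maximum value while selecting the k largest entries.
// The helper name and float element type are illustrative assumptions;
// it assumes 0 < k <= row.size().
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <utility>
#include <vector>

static void TopKLargestNaNAsMax(const std::vector<float>& row, int k,
                                std::vector<float>* values,
                                std::vector<int64_t>* indices) {
  std::vector<std::pair<float, int64_t>> col;
  col.reserve(row.size());
  for (int64_t j = 0; j < static_cast<int64_t>(row.size()); ++j) {
    col.emplace_back(row[j], j);
  }
  // NaN compares "greater" than any finite value, mirroring the removed kernel.
  auto greater = [](const std::pair<float, int64_t>& l,
                    const std::pair<float, int64_t>& r) {
    return (std::isnan(l.first) && !std::isnan(r.first)) || (l.first > r.first);
  };
  std::partial_sort(col.begin(), col.begin() + k, col.end(), greater);
  values->resize(k);
  indices->resize(k);
  for (int j = 0; j < k; ++j) {
    (*values)[j] = col[j].first;
    (*indices)[j] = col[j].second;
  }
}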
trans_out_dims[i] = out_dims[trans[i]]; - } - - Tensor trans_inp; - trans_inp.mutable_data(trans_dims, context.GetPlace()); - int ndims = trans.size(); - auto& dev_context = - context.template device_context(); - - // transpose the input value - TransCompute(ndims, dev_context, *input, - &trans_inp, trans); - - const int64_t input_height = - phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); - const int64_t input_width = trans_dims[trans_dims.size() - 1]; - - // Allocate the temp tensor to the save the topk indices, values - Tensor tmp_out; - T* t_out = tmp_out.mutable_data(trans_out_dims, context.GetPlace()); - Tensor tmp_indices; - auto* t_ind = - tmp_indices.mutable_data(trans_out_dims, context.GetPlace()); - - // get the TopK value - FullTopK(input_height, input_width, in_dims.size(), - &trans_inp, t_out, t_ind, k, largest, sorted); - // transpose back - TransCompute( - ndims, dev_context, tmp_indices, indices, trans); - TransCompute(ndims, dev_context, tmp_out, - output, trans); - } - } -}; - -template -class TopkV2GradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* out_grad = context.Input(framework::GradVarName("Out")); - auto* indices = context.Input("Indices"); - auto* x_grad = context.Output(framework::GradVarName("X")); - int axis = static_cast(context.Attr("axis")); - - const auto& in_dims = x->dims(); - const auto& out_dims = indices->dims(); - - // axis < 0, get the real axis - axis = (axis < 0) ? (in_dims.size() + axis) : axis; - const size_t& k = out_dims[axis]; - - T* x_grad_data = x_grad->mutable_data(context.GetPlace()); - if (axis + 1 == in_dims.size()) { - // allocate the memory for the input_grad - - // assign the out_grad to input_grad directly - const int64_t input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t input_width = in_dims[in_dims.size() - 1]; - - // init the output grad with 0, because some input elements has no grad - memset(x_grad_data, 0, x_grad->numel() * sizeof(T)); - // Assign the output_grad to input_grad - FullTopKAssign(input_height, input_width, in_dims.size(), out_grad, - indices, x_grad_data, k); - } else { - // can not assign grad to input_grad, must do the transpose - std::vector trans; - for (int i = 0; i < axis; i++) { - trans.emplace_back(i); - } - trans.emplace_back(out_dims.size() - 1); - for (int i = axis + 1; i < out_dims.size() - 1; i++) { - trans.emplace_back(i); - } - trans.emplace_back(axis); - framework::DDim trans_dims(out_dims); - framework::DDim trans_in_dims(in_dims); - for (size_t i = 0; i < trans.size(); i++) { - trans_dims[i] = out_dims[trans[i]]; - trans_in_dims[i] = in_dims[trans[i]]; - } - // transpose the out_grad, indices - Tensor trans_dO; - trans_dO.mutable_data(trans_dims, context.GetPlace()); - Tensor trans_ind; - trans_ind.mutable_data(trans_dims, context.GetPlace()); - int ndims = trans.size(); - auto& dev_context = - context.template device_context(); - - // Do transpose - TransCompute(ndims, dev_context, *out_grad, - &trans_dO, trans); - TransCompute( - ndims, dev_context, *indices, &trans_ind, trans); - const int64_t input_height = phi::product( - phi::slice_ddim(trans_in_dims, 0, trans_in_dims.size() - 1)); - const int64_t input_width = trans_in_dims[trans_in_dims.size() - 1]; - - // Assign the out_grad to tranpose input_grad - Tensor tmp_out; - T* t_out = tmp_out.mutable_data(trans_in_dims, context.GetPlace()); - memset(t_out, 0, 
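// A minimal standalone sketch (not part of the patch) of the permutation the
// removed TopkV2 kernels build when the requested axis is not the last one:
// `axis` is swapped with the final dimension, top-k runs on the last dim of
// the transposed tensor, and because a swap is its own inverse the same
// permutation transposes the results back. The helper name is illustrative.
#include <vector>

static std::vector<int> MakeTopKPerm(int rank, int axis) {
  std::vector<int> trans;
  trans.reserve(rank);
  for (int i = 0; i < axis; ++i) trans.push_back(i);
  trans.push_back(rank - 1);  // the last dim takes the place of `axis`
  for (int i = axis + 1; i < rank - 1; ++i) trans.push_back(i);
  trans.push_back(axis);      // `axis` becomes the last dim
  return trans;
}
// Example: rank = 4, axis = 1  ->  {0, 3, 2, 1}.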
x_grad->numel() * sizeof(T)); - - FullTopKAssign(input_height, input_width, in_dims.size(), - &trans_dO, &trans_ind, t_out, k); - - // Transpose back - TransCompute(ndims, dev_context, tmp_out, - x_grad, trans); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/top_k_v2_op_mlu.cc b/paddle/fluid/operators/top_k_v2_op_mlu.cc index 5b8a6b3e75449508afa5d316d81f97ab815c9ea9..caaae02124c926b9e4be08744e4192dab20ca5d0 100644 --- a/paddle/fluid/operators/top_k_v2_op_mlu.cc +++ b/paddle/fluid/operators/top_k_v2_op_mlu.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/top_k_v2_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/mlu/mlu_baseop.h" namespace paddle { diff --git a/paddle/fluid/operators/top_k_v2_op_npu.cc b/paddle/fluid/operators/top_k_v2_op_npu.cc index e11070638834c46a6628d652216e1ddddeb2487d..dff5c2d3f39378486bb5d2f8010d005d57b20550 100644 --- a/paddle/fluid/operators/top_k_v2_op_npu.cc +++ b/paddle/fluid/operators/top_k_v2_op_npu.cc @@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/top_k_v2_op.h" #include #include +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/top_k_v2_op_xpu.cc b/paddle/fluid/operators/top_k_v2_op_xpu.cc index 49daac2ff0da63c542a807dc97925c6989559f14..4d9c39be92eff029e66cdde900318b045c2b531f 100644 --- a/paddle/fluid/operators/top_k_v2_op_xpu.cc +++ b/paddle/fluid/operators/top_k_v2_op_xpu.cc @@ -16,7 +16,7 @@ limitations under the License. */ #include -#include "paddle/fluid/operators/top_k_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/transpose_op.h" #include "xpu/refactor/math.h" diff --git a/paddle/fluid/operators/trace_op.cc b/paddle/fluid/operators/trace_op.cc index 63b914a31a86aef48e952a4877c7beb670075cc4..c6c0fa3c0019eac742a9c70ea53a438f5a474895 100644 --- a/paddle/fluid/operators/trace_op.cc +++ b/paddle/fluid/operators/trace_op.cc @@ -61,7 +61,7 @@ the 2-D planes specified by dim1 and dim2. 
)DOC"); } }; -class TraceOpGrad : public framework::OperatorWithKernel { +class TraceGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -107,14 +107,14 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(TraceGradNoNeedBufferVarsInferer, "Input"); } // namespace paddle namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(trace, TraceInferShapeFunctor, - PT_INFER_META(phi::TraceInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(trace, TraceInferShapeFunctor, + PD_INFER_META(phi::TraceInferMeta)); REGISTER_OPERATOR(trace, ops::TraceOp, ops::TraceOpMaker, ops::TraceGradOpMaker, ops::TraceGradOpMaker, TraceInferShapeFunctor); -REGISTER_OPERATOR(trace_grad, ops::TraceOpGrad, +REGISTER_OPERATOR(trace_grad, ops::TraceGradOp, ops::TraceGradNoNeedBufferVarsInferer); /* ========================== register checkpoint ===========================*/ diff --git a/paddle/fluid/operators/transpose_op.cc b/paddle/fluid/operators/transpose_op.cc index 768ab21936f1efbd2f50470446fd3f8d3ecb094c..1a297e7238ccdacd9b4986a5fe69e155d30e4318 100644 --- a/paddle/fluid/operators/transpose_op.cc +++ b/paddle/fluid/operators/transpose_op.cc @@ -339,6 +339,14 @@ class Transpose2OpGrad : public framework::OperatorWithKernel { } }; +class TransposeGradInferVarType : public framework::VarTypeInference { + public: + void operator()(framework::InferVarTypeContext *ctx) const override { + ctx->SyncTypeAndDataType(framework::GradVarName("Out"), + framework::GradVarName("X")); + } +}; + } // namespace operators } // namespace paddle @@ -347,59 +355,13 @@ REGISTER_OPERATOR( transpose, ops::TransposeOp, ops::TransposeOpMaker, paddle::framework::DefaultGradOpMaker, paddle::framework::DefaultGradOpMaker); -REGISTER_OPERATOR(transpose_grad, ops::TransposeOpGrad); - -REGISTER_OP_CPU_KERNEL( - transpose, ops::TransposeKernel, - ops::TransposeKernel, - ops::TransposeKernel, - ops::TransposeKernel>, - ops::TransposeKernel>, - ops::TransposeKernel); -REGISTER_OP_CPU_KERNEL( - transpose_grad, - ops::TransposeGradKernel, - ops::TransposeGradKernel, - ops::TransposeGradKernel, - ops::TransposeGradKernel>, - ops::TransposeGradKernel>, - ops::TransposeGradKernel); +REGISTER_OPERATOR(transpose_grad, ops::TransposeOpGrad, + ops::TransposeGradInferVarType); REGISTER_OPERATOR(transpose2, ops::Transpose2Op, ops::Transpose2OpMaker, ops::Transpose2GradMaker, ops::Transpose2GradMaker); REGISTER_OPERATOR(transpose2_grad, ops::Transpose2OpGrad, + ops::TransposeGradInferVarType, ops::Transpose2DoubleGradMaker, ops::Transpose2DoubleGradMaker); - -REGISTER_OP_CPU_KERNEL( - transpose2, ops::TransposeKernel, - ops::TransposeKernel, - ops::TransposeKernel, - ops::TransposeKernel, - ops::TransposeKernel, - ops::TransposeKernel>, - ops::TransposeKernel>, - ops::TransposeKernel); -REGISTER_OP_CPU_KERNEL( - transpose2_grad, - ops::TransposeGradKernel, - ops::TransposeGradKernel, - ops::TransposeGradKernel, - ops::TransposeGradKernel, - ops::TransposeGradKernel, - ops::TransposeGradKernel>, - ops::TransposeGradKernel>, - ops::TransposeGradKernel); diff --git a/paddle/fluid/operators/transpose_op.cu b/paddle/fluid/operators/transpose_op.cu deleted file mode 100644 index 02e224549a5abfb14729355addeb52824e450570..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/transpose_op.cu +++ /dev/null @@ -1,139 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/transpose_op.cu.h" -#include "paddle/fluid/operators/transpose_op.h" -#include "paddle/fluid/platform/bfloat16.h" -#include "paddle/fluid/platform/float16.h" - -namespace paddle { -namespace operators { - -template -class TransposeGPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.InputVar("X"); - auto* out = context.OutputVar("Out"); - - const framework::Tensor* x_tensor = - GetLoDTensorOrSelectedRowsValueFromVar(*x); - framework::Tensor* out_tensor = - GetMutableLoDTensorOrSelectedRowsValueFromVar(out); - - out_tensor->mutable_data(context.GetPlace()); - if (out_tensor->numel() == 0) { - return; - } - - std::vector axis = context.Attr>("axis"); - int ndims = axis.size(); - const auto& dev_ctx = context.template device_context(); - TransposeGPUKernelDriver(dev_ctx, ndims, *x_tensor, axis, out_tensor); - } -}; -template -class TransposeGradGPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* out_grad = context.InputVar(framework::GradVarName("Out")); - auto* x_grad = context.OutputVar(framework::GradVarName("X")); - if (!x_grad) { - return; - } - - const framework::Tensor* out_grad_tensor = - GetLoDTensorOrSelectedRowsValueFromVar(*out_grad); - framework::Tensor* x_grad_tensor = - GetMutableLoDTensorOrSelectedRowsValueFromVar(x_grad); - - x_grad_tensor->mutable_data(context.GetPlace()); - if (x_grad_tensor->numel() == 0) { - return; - } - std::vector axis = context.Attr>("axis"); - std::vector reversed_axis(axis); - - for (size_t i = 0; i < axis.size(); i++) { - reversed_axis[axis[i]] = i; - } - - int ndims = axis.size(); - const auto& dev_ctx = context.template device_context(); - TransposeGPUKernelDriver(dev_ctx, ndims, *out_grad_tensor, reversed_axis, - x_grad_tensor); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL( - transpose, - ops::TransposeGPUKernel, - ops::TransposeGPUKernel, - ops::TransposeGPUKernel, - ops::TransposeGPUKernel, - ops::TransposeGPUKernel, - ops::TransposeGPUKernel>, - ops::TransposeGPUKernel>); -REGISTER_OP_CUDA_KERNEL( - transpose_grad, - ops::TransposeGradGPUKernel, - ops::TransposeGradGPUKernel, - ops::TransposeGradGPUKernel, - ops::TransposeGradGPUKernel, - ops::TransposeGradGPUKernel, - ops::TransposeGradGPUKernel>, - ops::TransposeGradGPUKernel>); - -REGISTER_OP_CUDA_KERNEL( - transpose2, - ops::TransposeGPUKernel, - ops::TransposeGPUKernel, - ops::TransposeGPUKernel, - ops::TransposeGPUKernel, - ops::TransposeGPUKernel, - ops::TransposeGPUKernel, - ops::TransposeGPUKernel, - ops::TransposeGPUKernel>, - ops::TransposeGPUKernel>); -REGISTER_OP_CUDA_KERNEL( - transpose2_grad, - ops::TransposeGradGPUKernel, - ops::TransposeGradGPUKernel, - ops::TransposeGradGPUKernel, - ops::TransposeGradGPUKernel, - 
ops::TransposeGradGPUKernel, - ops::TransposeGradGPUKernel, - ops::TransposeGradGPUKernel, - ops::TransposeGradGPUKernel>, - ops::TransposeGradGPUKernel>); diff --git a/paddle/fluid/operators/transpose_op.cu.h b/paddle/fluid/operators/transpose_op.cu.h index b542fa37f88fd3e4d53b475d49d8f0491b9b5b42..a31ac28c9910c0c36b28c98fd3d83476f002df7e 100644 --- a/paddle/fluid/operators/transpose_op.cu.h +++ b/paddle/fluid/operators/transpose_op.cu.h @@ -16,8 +16,9 @@ limitations under the License. */ #include "paddle/fluid/framework/gpu_utils.h" #include "paddle/fluid/operators/transpose_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" namespace paddle { namespace operators { @@ -258,10 +259,10 @@ struct SystemElemType<16> { }; template -void LaunchNarrowDims2TransposeKernel(const platform::CUDADeviceContext& d, - int tile_size_i, int tile_size_j, - int total_tiles_count, const T* input, - const Dim3& input_dims, T* output) { +void LaunchNarrowDims2TransposeKernel(const phi::GPUContext& d, int tile_size_i, + int tile_size_j, int total_tiles_count, + const T* input, const Dim3& input_dims, + T* output) { constexpr int NumThreads = tile_long; if (tile_size_i <= tile_long && tile_size_j <= tile_short) { TilingSwapDim1And2< @@ -278,7 +279,7 @@ void LaunchNarrowDims2TransposeKernel(const platform::CUDADeviceContext& d, template struct NarrowDims2TransposeDispatch { - static void DoTranspose(const platform::CUDADeviceContext& d, int tile_size_i, + static void DoTranspose(const phi::GPUContext& d, int tile_size_i, int tile_size_j, int total_tiles_count, const T* input, const Dim3& input_dims, T* output) { PADDLE_ENFORCE_EQ( @@ -319,7 +320,7 @@ struct NarrowDims2TransposeDispatch< T, tile_long, tile_short, typename std::enable_if< CheckNonLongTileSize(tile_long, tile_short, sizeof(T)), void>::type> { - static void DoTranspose(const platform::CUDADeviceContext& d, int tile_size_i, + static void DoTranspose(const phi::GPUContext& d, int tile_size_i, int tile_size_j, int total_tiles_count, const T* input, const Dim3& input_dims, T* output) { PADDLE_ENFORCE_EQ( @@ -351,7 +352,7 @@ struct NarrowDims2TransposeDispatch< T, tile_long, tile_short, typename std::enable_if::type> { - static void DoTranspose(const platform::CUDADeviceContext& d, int tile_size_i, + static void DoTranspose(const phi::GPUContext& d, int tile_size_i, int tile_size_j, int total_tiles_count, const T* input, const Dim3& input_dims, T* output) { PADDLE_ENFORCE_EQ( @@ -368,7 +369,7 @@ struct NarrowDims2TransposeDispatch< }; template -void SwapDim1And2InNarrow(const platform::CUDADeviceContext& d, const T* input, +void SwapDim1And2InNarrow(const phi::GPUContext& d, const T* input, const Dim3& input_dims, T* output, const int kMinTileSize) { // First get available tile sizes for the data type requested as backups @@ -473,9 +474,8 @@ __global__ void TransposeSimpleKernel(int nthreads, const T* __restrict__ input, // Here suppose convert all tensor to dim3, so just change dim1 and 2. 
template -void SendSwapDim1And2InTranspose(const platform::CUDADeviceContext& d, - const T* input, const Dim3& input_dims, - T* output) { +void SendSwapDim1And2InTranspose(const phi::GPUContext& d, const T* input, + const Dim3& input_dims, T* output) { // Suppose tile size > 16 static const int kMinTileSize = 16; static const int kMinNarrowTileSize = 96; @@ -512,7 +512,7 @@ void SendSwapDim1And2InTranspose(const platform::CUDADeviceContext& d, } else { // If input shape is small, such as 8X8, just do simple copy int total_elements = input_dims[0] * input_dims[1] * input_dims[2]; - auto config = GetGpuLaunchConfig1D(d, total_elements); + auto config = phi::backends::gpu::GetGpuLaunchConfig1D(d, total_elements); TransposeSimpleKernel<<< config.block_per_grid.x, config.thread_per_block.x, 0, d.stream()>>>( total_elements, input, input_dims, output); @@ -521,7 +521,7 @@ void SendSwapDim1And2InTranspose(const platform::CUDADeviceContext& d, template struct SwapDim1And2InTranspose { - typedef platform::CUDADeviceContext Device; + typedef phi::GPUContext Device; void operator()(const Device& d, const T* in, const std::vector& combined_dims, T* out) { Dim3 input_dims = {static_cast(combined_dims[0]), @@ -533,7 +533,7 @@ struct SwapDim1And2InTranspose { template struct SwapDim0And2InTranspose { - typedef platform::CUDADeviceContext Device; + typedef phi::GPUContext Device; void operator()(const Device& d, const T* in, const std::vector& combined_dims, T* out) { Dim3 input_dims = {static_cast(combined_dims[0]), @@ -541,7 +541,7 @@ struct SwapDim0And2InTranspose { static_cast(combined_dims[2])}; size_t total_size = combined_dims[0] * combined_dims[1] * combined_dims[2]; - auto config = GetGpuLaunchConfig1D(d, total_size); + auto config = phi::backends::gpu::GetGpuLaunchConfig1D(d, total_size); TransposeSimpleKernel<<< config.block_per_grid.x, config.thread_per_block.x, 0, d.stream()>>>( @@ -607,7 +607,7 @@ inline void CombineTransposeDim3(const framework::DDim& shape, template struct TransposeSimple { - static bool run(const platform::CUDADeviceContext& ctx, const Tensor& in, + static bool run(const phi::GPUContext& ctx, const Tensor& in, const std::vector perm, Tensor* out) { // First reduce the dimensions of the input tensor if possible. 
std::vector new_perm; @@ -654,12 +654,12 @@ struct TransposeSimple { }; template -void TransposeGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx, - const int ndims, const Tensor& in, - const std::vector perm, Tensor* out) { +void TransposeGPUKernelDriver(const phi::GPUContext& dev_ctx, const int ndims, + const Tensor& in, + const std::vector& perm, Tensor* out) { auto ret = TransposeSimple::run(dev_ctx, in, perm, out); if (!ret) { - TransCompute(ndims, dev_ctx, in, out, perm); + TransCompute(ndims, dev_ctx, in, out, perm); } } diff --git a/paddle/fluid/operators/transpose_op.h b/paddle/fluid/operators/transpose_op.h index ec05a534c0ef5327ec5d6d7f89b4e16b7a829434..a9e4876cc82a44ef8e87049a199ce0b58a96f6ea 100644 --- a/paddle/fluid/operators/transpose_op.h +++ b/paddle/fluid/operators/transpose_op.h @@ -59,63 +59,5 @@ inline void TransCompute(const int dim, const DeviceContext& dev_ctx, } } -template -class TransposeKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.InputVar("X"); - auto* out = context.OutputVar("Out"); - - const framework::Tensor* x_tensor = - GetLoDTensorOrSelectedRowsValueFromVar(*x); - framework::Tensor* out_tensor = - GetMutableLoDTensorOrSelectedRowsValueFromVar(out); - - out_tensor->mutable_data(context.GetPlace()); - if (out_tensor->numel() == 0) { - return; - } - - std::vector axis = context.Attr>("axis"); - int ndims = axis.size(); - auto& dev_ctx = context.template device_context(); - TransCompute(ndims, dev_ctx, *x_tensor, out_tensor, axis); - } -}; - -template -class TransposeGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* out_grad = context.InputVar(framework::GradVarName("Out")); - auto* x_grad = context.OutputVar(framework::GradVarName("X")); - - if (!x_grad) { - return; - } - const framework::Tensor* out_grad_tensor = - GetLoDTensorOrSelectedRowsValueFromVar(*out_grad); - framework::Tensor* x_grad_tensor = - GetMutableLoDTensorOrSelectedRowsValueFromVar(x_grad); - - x_grad_tensor->mutable_data(context.GetPlace()); - if (x_grad_tensor->numel() == 0) { - return; - } - - std::vector axis = context.Attr>("axis"); - std::vector reversed_axis(axis); - - for (size_t i = 0; i < axis.size(); i++) { - reversed_axis[axis[i]] = i; - } - - int ndims = axis.size(); - auto& dev_ctx = context.template device_context(); - TransCompute(ndims, dev_ctx, *out_grad_tensor, - x_grad_tensor, reversed_axis); - } -}; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/transpose_op_mlu.cc b/paddle/fluid/operators/transpose_op_mlu.cc new file mode 100644 index 0000000000000000000000000000000000000000..40cb22bab50ec0de5cc0fb9a2c6953637a238599 --- /dev/null +++ b/paddle/fluid/operators/transpose_op_mlu.cc @@ -0,0 +1,74 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/transpose_op.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" + +namespace paddle { +namespace operators { + +template +class TransposeMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + std::vector axis = ctx.Attr>("axis"); + out->mutable_data(ctx.device_context().GetPlace()); + + TransposeFromMLUTensor(ctx, axis, x, out, + false /*need_reshape_or_alloc*/); + } +}; + +template +class TransposeGradMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* out_grad = + ctx.Input(framework::GradVarName("Out")); + auto* x_grad = + ctx.Output(framework::GradVarName("X")); + std::vector axis = ctx.Attr>("axis"); + std::vector reversed_axis(axis); + for (size_t i = 0; i < axis.size(); i++) { + reversed_axis[axis[i]] = i; + } + x_grad->mutable_data(ctx.GetPlace()); + + TransposeFromMLUTensor(ctx, reversed_axis, out_grad, x_grad, + false /*need_reshape_or_alloc*/); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_MLU_KERNEL(transpose2, ops::TransposeMLUKernel, + ops::TransposeMLUKernel, + ops::TransposeMLUKernel, + ops::TransposeMLUKernel, + ops::TransposeMLUKernel, + ops::TransposeMLUKernel, + ops::TransposeMLUKernel); + +REGISTER_OP_MLU_KERNEL(transpose2_grad, ops::TransposeGradMLUKernel, + ops::TransposeGradMLUKernel, + ops::TransposeGradMLUKernel, + ops::TransposeGradMLUKernel, + ops::TransposeGradMLUKernel, + ops::TransposeGradMLUKernel, + ops::TransposeGradMLUKernel); diff --git a/paddle/fluid/operators/transpose_op_npu_test.cc b/paddle/fluid/operators/transpose_op_npu_test.cc index cce3f188c8b7429447309e989e1e0dd5b9f13be0..fb39034c8e92c1ac39aa1ca6e57d5a08ca1ca9d6 100644 --- a/paddle/fluid/operators/transpose_op_npu_test.cc +++ b/paddle/fluid/operators/transpose_op_npu_test.cc @@ -24,14 +24,13 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace f = paddle::framework; namespace p = paddle::platform; -USE_OP(transpose2); +USE_OP_ITSELF(transpose2); USE_OP_DEVICE_KERNEL(transpose2, NPU); template diff --git a/paddle/fluid/operators/triangular_solve_op.cc b/paddle/fluid/operators/triangular_solve_op.cc index 9233917b0931b98d30b736ec9b69fd68c0604d18..df84659a00f4c4220853404a8b28c6ccc93623a3 100644 --- a/paddle/fluid/operators/triangular_solve_op.cc +++ b/paddle/fluid/operators/triangular_solve_op.cc @@ -13,7 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. 
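// A minimal standalone sketch (not part of the patch) of why the transpose
// grad kernels above build `reversed_axis[axis[i]] = i`: that is the inverse
// permutation, and the gradient w.r.t. X is simply dOut transposed with it.
// The helper name is an illustrative assumption.
#include <cstddef>
#include <vector>

static std::vector<int> InversePerm(const std::vector<int>& axis) {
  std::vector<int> inv(axis.size());
  for (size_t i = 0; i < axis.size(); ++i) {
    // Output dim i of the forward transpose came from input dim axis[i],
    // so the backward transpose must send dim axis[i] back to position i.
    inv[axis[i]] = static_cast<int>(i);
  }
  return inv;
}
// Example: axis = {0, 2, 3, 1}  ->  inverse = {0, 3, 1, 2}.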
*/ #include "paddle/fluid/operators/triangular_solve_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/solve_op.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -22,58 +25,6 @@ class TriangularSolveOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "TriangularSolve"); - OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "TriangularSolve"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "TriangularSolve"); - - auto x_dims = ctx->GetInputDim("X"); - auto y_dims = ctx->GetInputDim("Y"); - - auto x_dims_n = x_dims.size(); - auto y_dims_n = y_dims.size(); - - PADDLE_ENFORCE_GE( - x_dims_n, 2, platform::errors::InvalidArgument( - "The input tensor X's dimensions of TriangularSolveOp " - "should be >= 2. But received X's " - "dimensions = %d, X's shape = [%s]", - x_dims.size(), x_dims)); - - PADDLE_ENFORCE_GE( - y_dims_n, 2, platform::errors::InvalidArgument( - "The input tensor Y's dimensions of TriangularSolveOp " - "should be >=2. But received Y's " - "dimensions = %d, Y's shape = [%s]", - y_dims.size(), y_dims)); - - PADDLE_ENFORCE_EQ(x_dims[x_dims_n - 2], x_dims[x_dims_n - 1], - platform::errors::InvalidArgument( - "The inner-most 2 dimensions of Input(X) all should " - "be square matrices " - "But received X's shape[-2] = %d and shape[-1] = %d.", - x_dims[x_dims_n - 2], x_dims[x_dims_n - 1])); - - std::vector x_dims_vec = phi::vectorize(x_dims); - std::vector y_dims_vec = phi::vectorize(y_dims); - - std::vector x_dims_vec_cut(x_dims_vec.begin(), - x_dims_vec.end() - 2); - std::vector y_dims_vec_cut(y_dims_vec.begin(), - y_dims_vec.end() - 2); - - std::vector expand_batch_portion = - get_broadcast_batch_portion(x_dims_vec_cut, y_dims_vec_cut); - - std::vector y_broadcast_dims({expand_batch_portion}); - y_broadcast_dims.insert(y_broadcast_dims.end(), {y_dims_vec[y_dims_n - 2], - y_dims_vec[y_dims_n - 1]}); - - // dim of 'Out' is the same with 'Y' after broadcast - ctx->SetOutputDim("Out", phi::make_ddim(y_broadcast_dims)); - ctx->ShareLoD("X", /*->*/ "Out"); - } - framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const { return framework::OpKernelType( @@ -168,20 +119,15 @@ class TriangularSolveOpGradMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; + +DECLARE_INFER_SHAPE_FUNCTOR(triangular_solve, TriangularSolveInferShapeFunctor, + PD_INFER_META(phi::TriangularSolveInferMeta)); + REGISTER_OPERATOR(triangular_solve, ops::TriangularSolveOp, ops::TriangularSolveOpMaker, ops::TriangularSolveOpInferVarType, ops::TriangularSolveOpGradMaker, - ops::TriangularSolveOpGradMaker); + ops::TriangularSolveOpGradMaker, + TriangularSolveInferShapeFunctor); REGISTER_OPERATOR(triangular_solve_grad, ops::TriangularSolveGradOp); - -REGISTER_OP_CPU_KERNEL( - triangular_solve, - ops::TriangularSolveKernel, - ops::TriangularSolveKernel); - -REGISTER_OP_CPU_KERNEL( - triangular_solve_grad, - ops::TriangularSolveGradKernel, - ops::TriangularSolveGradKernel); diff --git a/paddle/fluid/operators/triangular_solve_op.cu b/paddle/fluid/operators/triangular_solve_op.cu deleted file mode 100644 index 7df98517e8418905f0f8c8ce603762967a8b5f38..0000000000000000000000000000000000000000 --- 
a/paddle/fluid/operators/triangular_solve_op.cu +++ /dev/null @@ -1,65 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/reduce_ops/reduce_op.h" -#include "paddle/fluid/operators/triangular_solve_op.h" - -namespace paddle { -namespace operators { - -template -class MatrixReduceSumFunctor { - public: - void operator()(const Tensor& in, Tensor* out, - const framework::ExecutionContext& ctx) { - // For example: in's dim = [5, 3, 2, 7, 3] ; out's dim = [3, 1, 7, 3] - // out_reduce_dim should be [0, 2] - const std::vector in_dims = phi::vectorize(in.dims()); - auto in_size = in_dims.size(); - const std::vector out_dims = phi::vectorize(out->dims()); - auto out_size = out_dims.size(); - - std::vector out_bst_dims(in_size); - - std::fill(out_bst_dims.data(), out_bst_dims.data() + in_size - out_size, 1); - std::copy(out_dims.data(), out_dims.data() + out_size, - out_bst_dims.data() + in_size - out_size); - - std::vector out_reduce_dims; - for (size_t idx = 0; idx <= in_size - 3; idx++) { - if (in_dims[idx] != 1 && out_bst_dims[idx] == 1) { - out_reduce_dims.push_back(idx); - } - } - gpuStream_t stream = ctx.cuda_device_context().stream(); - TensorReduceImpl>( - ctx.cuda_device_context(), in, out, kps::IdentityFunctor(), - out_reduce_dims, stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - triangular_solve, - ops::TriangularSolveKernel, - ops::TriangularSolveKernel); - -REGISTER_OP_CUDA_KERNEL( - triangular_solve_grad, - ops::TriangularSolveGradKernel, - ops::TriangularSolveGradKernel); diff --git a/paddle/fluid/operators/triangular_solve_op.h b/paddle/fluid/operators/triangular_solve_op.h index 4e68add096ff28f5378b02689248c3957c1e8ae9..fd46aca456cd9bd883cf9d1ce3576b307794b1a5 100644 --- a/paddle/fluid/operators/triangular_solve_op.h +++ b/paddle/fluid/operators/triangular_solve_op.h @@ -21,7 +21,6 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/solve_op.h" #include "paddle/fluid/operators/tril_triu_op.h" #include "paddle/phi/core/ddim.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/complex_functors.h" namespace paddle { @@ -30,10 +29,10 @@ namespace operators { using Tensor = framework::Tensor; template -static void triangular_solve(const DeviceContext& context, const Tensor& x, - const Tensor& y, Tensor* out, bool upper, +static void triangular_solve(const DeviceContext &context, const Tensor &x, + const Tensor &y, Tensor *out, bool upper, bool transpose, bool unitriangular) { - // Tensor broadcast use eigen + // Tensor broadcast use eigen library std::vector x_bst_dims_vec; std::vector y_bst_dims_vec; std::tie(x_bst_dims_vec, y_bst_dims_vec) = get_broadcast_dims(x, y); @@ -61,169 +60,5 @@ static void triangular_solve(const DeviceContext& context, const Tensor& x, unitriangular); } -template -class MatrixReduceSumFunctor { - public: - void operator()(const Tensor& input, Tensor* output, - const framework::ExecutionContext& ctx); -}; - -template -class MatrixReduceSumFunctor { - public: - void operator()(const Tensor& in, Tensor* out, - const framework::ExecutionContext& ctx) { - // For example: in's dim = [5, 3, 2, 7, 3] ; out's dim = [3, 1, 7, 3] - // out_reduce_dim should be [0, 2] - const std::vector in_dims = phi::vectorize(in.dims()); - auto in_size = in_dims.size(); - const std::vector out_dims = phi::vectorize(out->dims()); - auto out_size = out_dims.size(); - - std::vector out_bst_dims(in_size); - - std::fill(out_bst_dims.data(), out_bst_dims.data() + in_size - out_size, 1); - std::copy(out_dims.data(), out_dims.data() + out_size, - out_bst_dims.data() + in_size - out_size); - out->Resize(phi::make_ddim(out_bst_dims)); - - std::vector out_reduce_dims; - for (size_t idx = 0; idx <= in_size - 3; idx++) { - if (in_dims[idx] != 1 && out_bst_dims[idx] == 1) { - out_reduce_dims.push_back(idx); - } - } - - ReduceKernelFunctor( - &in, out, out_reduce_dims, true, false, ctx) - .template apply(); - out->Resize(phi::make_ddim(out_dims)); - } -}; - -template -class TriangularSolveKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const auto* x = ctx.Input("X"); - const auto* y = ctx.Input("Y"); - auto* out = ctx.Output("Out"); - - bool upper = ctx.template Attr("upper"); - bool transpose = ctx.template Attr("transpose"); - bool unitriangular = ctx.template Attr("unitriangular"); - - const auto& dev_ctx = ctx.template device_context(); - triangular_solve(dev_ctx, *x, *y, out, upper, transpose, - unitriangular); - } -}; - -template -class TriangularSolveGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const auto* x = ctx.Input("X"); - const auto* y = ctx.Input("Y"); - const auto* out = ctx.Input("Out"); - const auto* dout = - ctx.Input(framework::GradVarName("Out")); - - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - - bool upper = ctx.template Attr("upper"); - bool transpose = ctx.template Attr("transpose"); - bool unitriangular = ctx.template Attr("unitriangular"); - - auto& dev_ctx = ctx.template device_context(); - - std::vector x_bst_dims_vec; - std::vector y_bst_dims_vec; - std::tie(x_bst_dims_vec, y_bst_dims_vec) = get_broadcast_dims(*x, *y); - - Tensor dy_bst(y->type()); - if (dy) { - dy->mutable_data(y->dims(), dev_ctx.GetPlace()); - 
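// A minimal standalone sketch (not part of the patch) of the rule the removed
// MatrixReduceSumFunctor uses to pick which batch dimensions to sum over when
// shrinking a broadcast gradient back to its original shape: left-pad the
// target shape with 1s, then reduce every leading (non-matrix) dim where the
// broadcast shape is larger than 1 but the target is 1. The function name is
// an illustrative assumption.
#include <algorithm>
#include <cstdint>
#include <vector>

static std::vector<int> BatchReduceDims(const std::vector<int64_t>& in_dims,
                                        const std::vector<int64_t>& out_dims) {
  const size_t in_size = in_dims.size();
  const size_t out_size = out_dims.size();
  std::vector<int64_t> out_bst(in_size, 1);
  std::copy(out_dims.begin(), out_dims.end(),
            out_bst.begin() + (in_size - out_size));
  std::vector<int> reduce_dims;
  // The trailing two (matrix) dims always agree, so only batch dims qualify.
  for (size_t idx = 0; idx + 2 < in_size; ++idx) {
    if (in_dims[idx] != 1 && out_bst[idx] == 1) {
      reduce_dims.push_back(static_cast<int>(idx));
    }
  }
  return reduce_dims;
}
// Example from the removed comment: in = {5, 3, 2, 7, 3}, out = {3, 1, 7, 3}
// gives reduce dims {0, 2}.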
dy_bst.Resize(phi::make_ddim(y_bst_dims_vec)); - dy_bst.mutable_data(dev_ctx.GetPlace()); - - // calculate x's conjugate for complex - Tensor x_conj(x->type()); - platform::ForRange x_for_range(dev_ctx, x->numel()); - phi::funcs::ConjFunctor x_functor( - x->data(), x->numel(), - x_conj.mutable_data(x->dims(), dev_ctx.GetPlace())); - x_for_range(x_functor); - - // reuse forward to get dy_bst, and the result has been broadcated. - triangular_solve(dev_ctx, x_conj, *dout, &dy_bst, upper, - !transpose, unitriangular); - - if (dy_bst.dims() == dy->dims()) { - framework::TensorCopy(dy_bst, dev_ctx.GetPlace(), dev_ctx, dy); - } else { - MatrixReduceSumFunctor functor; - functor(dy_bst, dy, ctx); - dy->Resize(y->dims()); - } - } - - Tensor dx_bst(x->type()); - if (dx) { - dx->mutable_data(x->dims(), dev_ctx.GetPlace()); - dx_bst.Resize(phi::make_ddim(x_bst_dims_vec)); - dx_bst.mutable_data(dev_ctx.GetPlace()); - - // calculate out's conjugate for complex - Tensor out_conj(out->type()); - platform::ForRange out_for_range(dev_ctx, out->numel()); - phi::funcs::ConjFunctor out_functor( - out->data(), out->numel(), - out_conj.mutable_data(out->dims(), dev_ctx.GetPlace())); - out_for_range(out_functor); - - auto blas = phi::funcs::GetBlas(ctx); - if (transpose) { - auto mat_dim_a = - phi::funcs::CreateMatrixDescriptor(out_conj.dims(), 0, false); - auto mat_dim_b = - phi::funcs::CreateMatrixDescriptor(dy_bst.dims(), 0, true); - blas.MatMul(out_conj, mat_dim_a, dy_bst, mat_dim_b, static_cast(-1), - &dx_bst, static_cast(0)); - } else { - auto mat_dim_a = - phi::funcs::CreateMatrixDescriptor(dy_bst.dims(), 0, false); - auto mat_dim_b = - phi::funcs::CreateMatrixDescriptor(out_conj.dims(), 0, true); - blas.MatMul(dy_bst, mat_dim_a, out_conj, mat_dim_b, static_cast(-1), - &dx_bst, static_cast(0)); - } - - Tensor dx_bst_upper(x->type()); - // get upper or lower triangular - dx_bst_upper.Resize(dx_bst.dims()); - dx_bst_upper.mutable_data(dev_ctx.GetPlace()); - - const auto& dims = dx_bst.dims(); - const auto H = dims[dims.size() - 2]; - const auto W = dims[dims.size() - 1]; - platform::ForRange x_for_range(dev_ctx, dx_bst.numel()); - TrilTriuCompute tril_triu_computer(dx_bst.data(), unitriangular, - !upper, H, W, - dx_bst_upper.data()); - x_for_range(tril_triu_computer); - - if (dx_bst_upper.dims() == dx->dims()) { - framework::TensorCopy(dx_bst_upper, dev_ctx.GetPlace(), dev_ctx, dx); - } else { - MatrixReduceSumFunctor functor; - functor(dx_bst_upper, dx, ctx); - dx->Resize(x->dims()); - } - } - } -}; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/tril_triu_op_xpu.cc b/paddle/fluid/operators/tril_triu_op_xpu.cc new file mode 100644 index 0000000000000000000000000000000000000000..e36cbcf228cfbf30c8fcd5562ac40f38a5467cdb --- /dev/null +++ b/paddle/fluid/operators/tril_triu_op_xpu.cc @@ -0,0 +1,53 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under +the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef PADDLE_WITH_XPU + +#include "paddle/fluid/operators/tril_triu_op.h" +#include "paddle/fluid/platform/device/device_wrapper.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class TrilTriuXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const auto* x = context.Input("X"); + const auto* x_data = x->data(); + auto* out = context.Output("Out"); + auto* out_data = out->mutable_data(context.GetPlace()); + + const int diagonal = context.Attr("diagonal"); + const bool lower = context.Attr("lower"); + auto xshape = phi::vectorize(x->dims()); + auto& dev_ctx = context.template device_context(); + int r = 0; + if (lower) { + r = xpu::tril(dev_ctx.x_context(), x_data, out_data, xshape, diagonal); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "tril_op"); + } else { + r = xpu::triu(dev_ctx.x_context(), x_data, out_data, xshape, diagonal); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "triu_op"); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_XPU_KERNEL( + tril_triu, ops::TrilTriuXPUKernel, + ops::TrilTriuXPUKernel); +#endif diff --git a/paddle/fluid/operators/trunc_op.cc b/paddle/fluid/operators/trunc_op.cc index 54f4deac80a74e2e471036c2e25d08a582e29a9d..b77775f5a8c094fc7aa05f2f017834681424207f 100644 --- a/paddle/fluid/operators/trunc_op.cc +++ b/paddle/fluid/operators/trunc_op.cc @@ -69,8 +69,8 @@ class TruncGradOpMaker : public framework::SingleGradOpMaker { } // namespace operators } // namespace paddle -DELCARE_INFER_SHAPE_FUNCTOR(trunc, TruncInferShapeFunctor, - PT_INFER_META(phi::UnchangedInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(trunc, TruncInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); namespace ops = paddle::operators; REGISTER_OPERATOR(trunc, ops::TruncOp, ops::TruncOpMaker, diff --git a/paddle/fluid/operators/truncated_gaussian_random_op.cc b/paddle/fluid/operators/truncated_gaussian_random_op.cc index 6eb7f922dfdbec41aa1c47d11e1decc259d08689..dc5a66dce16d698f9cfac01e3bdc776d08c2af19 100644 --- a/paddle/fluid/operators/truncated_gaussian_random_op.cc +++ b/paddle/fluid/operators/truncated_gaussian_random_op.cc @@ -17,8 +17,10 @@ limitations under the License. 
 */
 
 #include 
 #include "paddle/fluid/framework/generator.h"
+#include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/truncated_gaussian_random_op.h"
+#include "paddle/phi/infermeta/nullary.h"
 
 namespace paddle {
 namespace operators {
@@ -27,26 +29,6 @@ class TruncatedGaussianRandomOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(
-        ctx->HasOutput("Out"), true,
-        platform::errors::NotFound(
-            "Output(Out) of TruncatedGaussianRandomOp should not be null."));
-    auto shape = ctx->Attrs().Get>("shape");
-    std::vector out_dim;
-    out_dim.reserve(shape.size());
-    for (auto dim : shape) {
-      out_dim.push_back(static_cast(dim));
-    }
-    PADDLE_ENFORCE_GT(
-        shape.size(), 0UL,
-        platform::errors::InvalidArgument(
-            "the input shape of TruncatedGaussianRandomOp must be set, "
-            "But the rank of shape we received is %d",
-            shape.size()));
-    ctx->SetOutputDim("Out", phi::make_ddim(out_dim));
-  }
-
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
@@ -99,6 +81,14 @@ Used to initialize tensors with truncated gaussian random generator.
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_WITHOUT_GRADIENT(truncated_gaussian_random,
-                             ops::TruncatedGaussianRandomOp,
-                             ops::TruncatedGaussianRandomOpMaker);
+
+DECLARE_INFER_SHAPE_FUNCTOR(
+    truncated_gaussian_random, TruncatedGaussianRandomInferShapeFunctor,
+    PD_INFER_META(phi::TruncatedGaussianRandomInferMeta));
+
+REGISTER_OPERATOR(
+    truncated_gaussian_random, ops::TruncatedGaussianRandomOp,
+    ops::TruncatedGaussianRandomOpMaker,
+    paddle::framework::EmptyGradOpMaker,
+    paddle::framework::EmptyGradOpMaker,
+    TruncatedGaussianRandomInferShapeFunctor);
diff --git a/paddle/fluid/operators/unfold_op.cc b/paddle/fluid/operators/unfold_op.cc
index c45b839d5b40bd1d0db25743406bb8cc319f1280..02fed3de6cef74f19a5dd4d8500017e6097a56a4 100644
--- a/paddle/fluid/operators/unfold_op.cc
+++ b/paddle/fluid/operators/unfold_op.cc
@@ -119,8 +119,8 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(UnfoldGradOpNoNeedBufferVarsInferer, "X");
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-DELCARE_INFER_SHAPE_FUNCTOR(unfold, UnfoldInferShapeFunctor,
-                            PT_INFER_META(phi::UnfoldInferMeta));
+DECLARE_INFER_SHAPE_FUNCTOR(unfold, UnfoldInferShapeFunctor,
+                            PD_INFER_META(phi::UnfoldInferMeta));
 REGISTER_OPERATOR(unfold, ops::UnfoldOp, ops::UnfoldOpMaker,
                   ops::UnfoldGradMaker,
                   ops::UnfoldGradMaker,
diff --git a/paddle/fluid/operators/uniform_random_op.cc b/paddle/fluid/operators/uniform_random_op.cc
index 353d653f48141b2e68db6143c1ca0859a9ecc13f..1c22e60fa87aa73246806e4f5bc70e49a3b0f958 100644
--- a/paddle/fluid/operators/uniform_random_op.cc
+++ b/paddle/fluid/operators/uniform_random_op.cc
@@ -281,10 +281,6 @@ REGISTER_OPERATOR(
     paddle::framework::EmptyGradOpMaker,
     paddle::operators::UniformRandomOpVarTypeInference);
 
-REGISTER_OP_CPU_KERNEL(
-    uniform_random, paddle::operators::CPUUniformRandomKernel,
-    paddle::operators::CPUUniformRandomKernel,
-    paddle::operators::CPUUniformRandomKernel);
 REGISTER_OP_CPU_KERNEL(
     uniform_random_batch_size_like,
     paddle::operators::CPUUniformRandomKernel,
diff --git a/paddle/fluid/operators/uniform_random_op.cu b/paddle/fluid/operators/uniform_random_op.cu
index fb38a6aded4cf173bb4c0dd96d131ff520b6701e..2ceb8a68d863dfe71458c67deeac7f54df0a662b 100644
--- a/paddle/fluid/operators/uniform_random_op.cu
+++ b/paddle/fluid/operators/uniform_random_op.cu
@@ -58,9 +58,6 @@ class GPUUniformRandomKernel : public framework::OpKernel {
 }  // namespace operators
 }  // namespace paddle
 
-REGISTER_OP_CUDA_KERNEL(uniform_random,
-                        paddle::operators::GPUUniformRandomKernel,
-                        paddle::operators::GPUUniformRandomKernel);
 REGISTER_OP_CUDA_KERNEL(uniform_random_batch_size_like,
                         paddle::operators::GPUUniformRandomKernel,
                         paddle::operators::GPUUniformRandomKernel);
diff --git a/paddle/fluid/operators/uniform_random_op.h b/paddle/fluid/operators/uniform_random_op.h
index a864c48ad757411861b6d2b3be40361c347601f8..b941dc21c3ab213e5abc2c4c908413b2b6222c41 100644
--- a/paddle/fluid/operators/uniform_random_op.h
+++ b/paddle/fluid/operators/uniform_random_op.h
@@ -25,8 +25,9 @@ DECLARE_bool(use_curand);
 #include 
 #include 
 #include "paddle/fluid/framework/generator.h"
-#include "paddle/fluid/operators/index_impl.cu.h"
 #include "paddle/phi/kernels/full_kernel.h"
+#include "paddle/phi/kernels/funcs/distribution_helper.h"
+#include "paddle/phi/kernels/funcs/index_impl.cu.h"
 #endif
 
 namespace paddle {
@@ -206,21 +207,21 @@ void UniformRandom(const framework::ExecutionContext& context,
   if (gen_cuda->GetIsInitPy() && seed_flag) {
     if (FLAGS_use_curand) {
       using MT = typename details::MPTypeTrait::Type;
-      distribution::uniform_distribution dist;
-      distribution::uniform_transform trans(min, max);
-      distribution::distribution_and_transform(dev_cxt, tensor, dist, trans);
+      phi::funcs::uniform_distribution dist;
+      phi::funcs::uniform_real_transform trans(min, max);
+      phi::funcs::distribution_and_transform(dev_cxt, tensor, dist, trans);
     } else {
       auto seed_offset = gen_cuda->IncrementOffset(1);
       int64_t gen_offset = size * seed_offset.second;
       auto func =
           UniformGeneratorOffset(min, max, seed_offset.first, diag_num,
                                  diag_step, diag_val, gen_offset);
-      IndexKernel>(dev_cxt, tensor, func);
+      phi::IndexKernel>(dev_cxt, tensor, func);
     }
   } else {
     auto func =
         UniformGenerator(min, max, seed, diag_num, diag_step, diag_val);
-    IndexKernel>(dev_cxt, tensor, func);
+    phi::IndexKernel>(dev_cxt, tensor, func);
   }
 }
 #endif
diff --git a/paddle/fluid/operators/uniform_random_op_mlu.cc b/paddle/fluid/operators/uniform_random_op_mlu.cc
index 1600bedc6b2fae9ba65a32e831eae4f43abeddf8..2c5f13f5a930788651c2e287febab7ad06aefd20 100644
--- a/paddle/fluid/operators/uniform_random_op_mlu.cc
+++ b/paddle/fluid/operators/uniform_random_op_mlu.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/uniform_random_op.h"
+#include "paddle/fluid/framework/generator.h"
 #include "paddle/fluid/operators/mlu/mlu_baseop.h"
 
 namespace paddle {
@@ -57,14 +58,45 @@ class MLUUniformRandomKernel : public framework::OpKernel {
     tensor->mutable_data(ctx.GetPlace());
     int64_t size = tensor->numel();
-    const float min = static_cast(ctx.Attr("min"));
-    const float max = static_cast(ctx.Attr("max"));
+
+    Tensor cpu_tensor(tensor->dtype());
+    cpu_tensor.Resize(tensor->dims());
+    T *data_cpu = cpu_tensor.mutable_data(platform::CPUPlace());
+
+    std::uniform_real_distribution dist(
+        static_cast(ctx.Attr("min")),
+        static_cast(ctx.Attr("max")));
     unsigned int seed = static_cast(ctx.Attr("seed"));
-    // make mlu seed
-    MLUCnnlRandomGeneratorDesc random_desc(/*is_mlu200=*/false, seed);
-    cnnlDataType_t data_type = ToCnnlDataType(tensor->type());
-    MLUCnnl::RandomUniform(ctx, size, /*data type=*/data_type,
-                           random_desc.get(), min, max, GetBasePtr(tensor));
+    auto engine = framework::GetCPURandomEngine(seed);
+
+    for (int64_t i = 0; i < size; ++i) {
+      data_cpu[i] = dist(*engine);
+    }
+
+    unsigned int diag_num =
+        static_cast(ctx.Attr("diag_num"));
+    unsigned int diag_step =
+        static_cast(ctx.Attr("diag_step"));
+    auto diag_val = static_cast(ctx.Attr("diag_val"));
+    if (diag_num > 0) {
+      PADDLE_ENFORCE_GT(
+          size, (diag_num - 1) * (diag_step + 1),
+          platform::errors::InvalidArgument(
+              "ShapeInvalid: the diagonal's elements is equal (num-1) "
+              "* (step-1) with num %d, step %d,"
+              "It should be smaller than %d, but received %d",
+              diag_num, diag_step, (diag_num - 1) * (diag_step + 1), size));
+      for (int64_t i = 0; i < diag_num; ++i) {
+        int64_t pos = i * diag_step + i;
+        data_cpu[pos] = diag_val;
+      }
+    }
+
+    // copy to MLU
+    framework::TensorCopy(
+        cpu_tensor, ctx.GetPlace(),
+        ctx.template device_context(), tensor);
+    ctx.template device_context().Wait();
   }
 };
diff --git a/paddle/fluid/operators/unity_build_rule.cmake b/paddle/fluid/operators/unity_build_rule.cmake
index 5ab2004617810b34276632fa487e8f12d7b3b915..1be8f3387dbad85e0dce3593ad61b9c116b10ef0 100644
--- a/paddle/fluid/operators/unity_build_rule.cmake
+++ b/paddle/fluid/operators/unity_build_rule.cmake
@@ -236,7 +236,6 @@ register_unity_group(cc
     scatter_nd_add_op.cc
     scatter_op.cc
     seed_op.cc
-    segment_pool_op.cc
     select_input_op.cc
     select_output_op.cc)
 register_unity_group(cc
@@ -496,8 +495,7 @@ register_unity_group(cu
     scale_op.cu
     scatter_nd_add_op.cu
     scatter_op.cu
-    seed_op.cu
-    segment_pool_op.cu)
+    seed_op.cu)
 register_unity_group(cu
     roi_pool_op.cu
     selu_op.cu
diff --git a/paddle/fluid/operators/unsqueeze_op_npu_test.cc b/paddle/fluid/operators/unsqueeze_op_npu_test.cc
index 3e11c952d15f3460f987f6fa2cb28970f97cc96b..a8ced783744a961eb8ce64983de7e9615763c1b6 100644
--- a/paddle/fluid/operators/unsqueeze_op_npu_test.cc
+++ b/paddle/fluid/operators/unsqueeze_op_npu_test.cc
@@ -24,7 +24,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/dropout_op.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
diff --git a/paddle/fluid/operators/viterbi_decode_op.cc b/paddle/fluid/operators/viterbi_decode_op.cc
index bf1cdeed65a8427c19410347209faa099673cb7c..602376d54e0d2a49b6cf4f6a78d332154c188a7e 100644
--- a/paddle/fluid/operators/viterbi_decode_op.cc
+++ b/paddle/fluid/operators/viterbi_decode_op.cc
@@ -9,8 +9,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/viterbi_decode_op.h"
+#include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/ternary.h"
 
 namespace paddle {
 namespace operators {
@@ -19,47 +21,6 @@ class ViterbiDecodeOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "ViterbiDecode");
-    OP_INOUT_CHECK(ctx->HasInput("Transition"), "Input", "Transition",
-                   "ViterbiDecode");
-    OP_INOUT_CHECK(ctx->HasInput("Length"), "Input", "Length", "ViterbiDecode");
-    OP_INOUT_CHECK(ctx->HasOutput("Scores"), "Output", "Scores",
-                   "ViterbiDecode");
-    OP_INOUT_CHECK(ctx->HasOutput("Path"), "Output", "Path", "ViterbiDecode");
-    auto in_dims = ctx->GetInputDim("Input");
-    PADDLE_ENFORCE_EQ(in_dims.size(), 3,
-                      platform::errors::InvalidArgument(
-                          "The rank of Input in ViterbiDecode must be 3. But "
-                          "received Input's rank is %d.",
-                          in_dims.size()));
-    auto length_dims = ctx->GetInputDim("Length");
-    PADDLE_ENFORCE_EQ(length_dims.size(), 1,
-                      platform::errors::InvalidArgument(
-                          "The rank of Length in ViterbiDecode must be 1. But "
-                          "received Length's rank is %d.",
-                          length_dims.size()));
-    auto transition_dims = ctx->GetInputDim("Transition");
-    PADDLE_ENFORCE_EQ(
-        transition_dims.size(), 2,
-        platform::errors::InvalidArgument(
-            "The rank of Transition in ViterbiDecode must be 2. But "
-            "received Transition's rank is %d.",
-            transition_dims.size()));
-    if (ctx->IsRuntime()) {
-      PADDLE_ENFORCE_EQ(
-          in_dims[0], length_dims[0],
-          platform::errors::InvalidArgument(
-              "The batch size of Input and Length should be equal."));
-      PADDLE_ENFORCE_EQ(in_dims[2], transition_dims[0],
-                        platform::errors::InvalidArgument(
-                            "The number of tags of Input (%d) and Transition "
-                            "(%d) should be equal.",
-                            transition_dims[0], in_dims[2]));
-    }
-    ctx->SetOutputDim("Scores", length_dims);
-  }
-
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
@@ -102,8 +63,8 @@ class ViterbiDecodeOpMaker : public framework::OpProtoAndCheckerMaker {
 
 namespace ops = paddle::operators;
 namespace platform = paddle::platform;
+DECLARE_INFER_SHAPE_FUNCTOR(viterbi_decode, ViterbiDecodeInferShapeFunctor,
+                            PD_INFER_META(phi::ViterbiDecodeInferMeta));
 REGISTER_OP_WITHOUT_GRADIENT(viterbi_decode, ops::ViterbiDecodeOp,
-                             ops::ViterbiDecodeOpMaker);
-REGISTER_OP_CPU_KERNEL(
-    viterbi_decode, ops::ViterbiDecodeKernel,
-    ops::ViterbiDecodeKernel);
+                             ops::ViterbiDecodeOpMaker,
+                             ViterbiDecodeInferShapeFunctor);
diff --git a/paddle/fluid/operators/viterbi_decode_op.cu b/paddle/fluid/operators/viterbi_decode_op.cu
deleted file mode 100644
index 3c546dd8156a2bdffc9615d171d4630faf3bb7fb..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/viterbi_decode_op.cu
+++ /dev/null
@@ -1,202 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/elementwise/elementwise_functor.h"
-#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h"
-#include "paddle/fluid/operators/gather.cu.h"
-#include "paddle/fluid/operators/viterbi_decode_op.h"
-
-#ifdef __NVCC__
-#include "cub/cub.cuh"
-#endif
-#ifdef __HIPCC__
-#include 
-namespace cub = hipcub;
-#endif
-
-namespace paddle {
-namespace operators {
-
-#define FIXED_BLOCK_DIM_CASE_BASE(log2_block_dim, ...)  \
-  case (1 << (log2_block_dim)): {                       \
-    constexpr auto kBlockDim = (1 << (log2_block_dim)); \
-    __VA_ARGS__;                                        \
-  } break
-
-#define FIXED_BLOCK_DIM_CASE(...)               \
-  FIXED_BLOCK_DIM_CASE_BASE(10, ##__VA_ARGS__); \
-  FIXED_BLOCK_DIM_CASE_BASE(9, ##__VA_ARGS__);  \
-  FIXED_BLOCK_DIM_CASE_BASE(8, ##__VA_ARGS__);  \
-  FIXED_BLOCK_DIM_CASE_BASE(7, ##__VA_ARGS__);  \
-  FIXED_BLOCK_DIM_CASE_BASE(6, ##__VA_ARGS__);  \
-  FIXED_BLOCK_DIM_CASE_BASE(5, ##__VA_ARGS__);  \
-  FIXED_BLOCK_DIM_CASE_BASE(4, ##__VA_ARGS__);  \
-  FIXED_BLOCK_DIM_CASE_BASE(3, ##__VA_ARGS__);
-
-int64_t ComputeBlockSize(int64_t col) {
-  if (col > 512)
-    return 1024;
-  else if (col > 256)
-    return 512;
-  else if (col > 128)
-    return 256;
-  else if (col > 64)
-    return 128;
-  else if (col > 32)
-    return 64;
-  else if (col > 16)
-    return 32;
-  else if (col > 8)
-    return 16;
-  else
-    return 8;
-}
-
-template